[Cryptography] RFC possible changes for Linux random device

Fri Sep 12 19:21:39 EDT 2014

Sandy Harris <sandyinchina at gmail.com> wrote:

> I have some experimental code to replace parts of random.c ...

> Next two posts will be the main code and a support program it uses.

/*
    Experimental code to replace parts of random.c

    I change nothing on the input side; the
    whole entropy collection and estimation
    part of existing code are untouched. The
    hashing and output routines, though, are
    completely replaced, and much of the
    initialisation code is modified.

    Uses 128-bit hash from AES-GCM instead of
    160-bit SHA-1.

    This sort of hash-like primitive has largely
    replaced more complex hashes in applications
    such as IPsec authentication; in most cases
    the new methods are considerably faster and
    the code is simpler. It therefore seemed
    worth trying such a hash here. I chose the
    one from AES-GCM because it is widely used,
    well-analysed, and considered secure.

    References include RFCs 4106 and 5288, NIST
    standard SP-800-38D and many academic papers.
    https://eprint.iacr.org/2013/157.pdf discusses
    bugs in the Open SSL version of this hash.
    Intel have a reference on doing it faster on
    CPUs with a new instruction PCLMULQDQ.

    Whether it is secure for this applicatiion
    needs analysis. One concern is that IPsec
    generates a 128-bit hash but uses only 96
    bits in the authentication which makes some
    attacks much harder. This application uses
    the whole 128 bits. Also, the input for
    IPsec authentication is ciphertext, which
    is highly random with any decent cipher;
    input here is pool data which may be much
    less random.

    I add some complications beyond the basic
    hash; those need analysis as well.

    Changing the hash also allows other changes.
    One design goal was improved decoupling so
    that heavy use of /dev/urandom does not
    deplete the entropy pool for /dev/random.
    Another was simpler mixing in of additional
    data in various places. See later comments
    for details.

    Sandy Harris, sandyinchina at gmail.com
    September 2014
*/

/*
    memcpy(), memset(), ...
*/
#include <string.h>

/*
    Just for convenience
*/

#include <stdio.h>
#include <stdint.h>

typedef uint32_t u32 ;
typedef unsigned char byte ;

/*
    define if arch_get_random_long() is available
#define HAVE_HW_RAND
*/

/*
    Give every compiled version a different seed using
    bits from /dev/urandom on the development machine.

    This falls well short of the ideal solution, giving
    every installation a different seed. For that, see:
    http://www.av8n.com/computer/htm/secure-random.htm#sec-boot-image

    On the other hand, neither sort of seed is necessary if
      either you have a trustworthy hardware RNG
      or /dev/random is initialised early with secure stored data
    In those cases, the device gets adequately initialised.

    This is just a defense-in-depth trick, can do no harm
    and may sometimes make attacks harder. Not ideal, not
    always necessary, but a cheap addition and probably the
    best we can do at compile time.

    random_init.h sets three #defines
      POOL_ROWS    128-bit rows per pool (4)
      ARRAY_ROWS    (POOL_ROWS*3)
      ARRAY_WORDS    (ARRAY_ROWS*4)
    creates
      u32 init_data[ARRAY_WORDS]
    and fills it with random data.

    See gen_random_init.c for details

    The choice of 12 128-bit quantities for this is
    partly arbitrary, partly reasoned.

     On the one hand, it would be possible to do more
     here, e.g. initialising all the pools randomly
     since using /dev/urandom on the development
     machine is presumably cheap.

     On the other, 128 or 256 random bits are almost
     certainly enough.

    The AES-GCM hash initialises its accumulator to
    128 zero bits and uses only one constant, H. I
    chose instead to use two constants, initialise
    with one and use the other in the role of H.

    Then I chose to use two hash iterations. The output
    of the first is used as feedback into the pool and
    as input for the second hash. The second, using
    different constants, then generates the actual
    output; this is intended to make the feedback and
    the output largely independent.

    This requires that a total of four 128-bit constants
    be used in each output operation. I chose to give
    each pool its own four instead of using one set for
    all pools, so there are twelve altogether. Finally,
    I chose to initialise all twelve with random data.

    Any of those choices might be changed, but they
    all seem reasonable to me.
*/

#include "random_init.h"

/*
    copy some things from random.c
    just enough for testing my code
    modified a bit
*/

/*
 * Configuration information
 */
#define INPUT_POOL_SHIFT    12
#define INPUT_POOL_WORDS    (1 << (INPUT_POOL_SHIFT-5))
#define INPUT_POOL_BYTES    (INPUT_POOL_WORDS<<2)
#define OUTPUT_POOL_SHIFT    10
#define OUTPUT_POOL_WORDS    (1 << (OUTPUT_POOL_SHIFT-5))
#define OUTPUT_POOL_BYTES    (OUPUT_POOL_WORDS<<2)

static u32 input_pool_data[INPUT_POOL_WORDS];
static u32 blocking_pool_data[OUTPUT_POOL_WORDS];
static u32 nonblocking_pool_data[OUTPUT_POOL_WORDS];

/*
    reduced version of struct
    OK for testing this code

    would need integration with existing struct
    before actual use
*/
struct my_pool    {
    // used in hash & output mixing
    u32 *A, *B, *C, *D ;
    int which, count, entropy_count ;
    // used for mixing data back into pool
    u32 *p, *q, *end, delta ;
    // actual pool data
    u32 *data, size ;
    } ;

static struct my_pool input_pool, blocking_pool, nonblocking_pool;

static void simple_get( struct my_pool *, u32 * ) ;
static void mix_last( struct my_pool *, u32 * ) ;

/***************************************************************************
*    various operations on 128-bit buffers
****************************************************************************/

// xor one buffer into another
static inline void xor128( u32 *target, u32 *source)
{
    int i ;
    for( i = 0 ; i < 4 ; i++ )
        *target++ ^= *source++ ;
}

// not a 128-bit addition, just four 32-bit
static inline void add128(u32 *target, u32 *source)
{
    int i ;
    for( i = 0 ; i < 4 ; i++ )
        *target++ += *source++ ;
}

/*
    zero out a 128-bit buffer

    both buffer2pool() and buffer2array()
    call this to clear their inputs

    in the kernel context
    should use memzero_explicit()
*/
static inline void zero128( u32 *target )
{
    // printf( "zero128() %08x\n", target );
    memset( (void *) target, 0, 16) ;
}

/*
    4-way pseudo-Hadamard transform
    using 32-bit chunks

    every output word depends on every input word
    any size PHT is invertible
    it loses no entropy

    conceptually, a 2-way PHT on a, b is
        x = a + b
        y = a + 2b
        a = x
        b = y
    a better implementation is just
        a += b
        b += a

    a 4-way PHT is built from 4 2-way PHTs
    here it is unrolled into 8 += operations
*/
static void pht128(u32 *t)
{
    // printf( "pht128()\n" ) ;
    t[0] += t[1] ;
    t[1] += t[0] ;
    t[2] += t[3] ;
    t[3] += t[2] ;
    t[0] += t[2] ;
    t[2] += t[0] ;
    t[1] += t[3] ;
    t[3] += t[1] ;
}

/*
    mixer from the Aria block cipher, a Korean standard

    Cipher home page:
    http://210.104.33.10/ARIA/index-e.html
    See also RFC 5794

    Version here is based on GPL source at:
    http://www.oryx-embedded.com/doc/aria_8c_source.html

    mixes a 128-bit vector
    equivalent to a Boolean matrix multiplication
    every output byte depends on seven input bytes
    invertible; loses no entropy

     fairly efficient
     it is used in every round of the cipher
     there are 16 rounds & the cipher is fast

    some caution is needed in applying this since the
    function is its own inverse; using it twice on the
    same data gets you right back where you started
 */
static void aria_mix( byte *x )
{
    byte y[16] ;

    // printf( "aria_mix()\n" ) ;

    // each output byte is the XOR of 7 input bytes
    y[0] = x[3] ^ x[4] ^ x[6] ^ x[8] ^ x[9] ^ x[13] ^ x[14];
    y[1] = x[2] ^ x[5] ^ x[7] ^ x[8] ^ x[9] ^ x[12] ^ x[15];
    y[2] = x[1] ^ x[4] ^ x[6] ^ x[10] ^ x[11] ^ x[12] ^ x[15];
    y[3] = x[0] ^ x[5] ^ x[7] ^ x[10] ^ x[11] ^ x[13] ^ x[14];
    y[4] = x[0] ^ x[2] ^ x[5] ^ x[8] ^ x[11] ^ x[14] ^ x[15];
    y[5] = x[1] ^ x[3] ^ x[4] ^ x[9] ^ x[10] ^ x[14] ^ x[15];
    y[6] = x[0] ^ x[2] ^ x[7] ^ x[9] ^ x[10] ^ x[12] ^ x[13];
    y[7] = x[1] ^ x[3] ^ x[6] ^ x[8] ^ x[11] ^ x[12] ^ x[13];
    y[8] = x[0] ^ x[1] ^ x[4] ^ x[7] ^ x[10] ^ x[13] ^ x[15];
    y[9] = x[0] ^ x[1] ^ x[5] ^ x[6] ^ x[11] ^ x[12] ^ x[14];
    y[10] = x[2] ^ x[3] ^ x[5] ^ x[6] ^ x[8] ^ x[13] ^ x[15];
    y[11] = x[2] ^ x[3] ^ x[4] ^ x[7] ^ x[9] ^ x[12] ^ x[14];
    y[12] = x[1] ^ x[2] ^ x[6] ^ x[7] ^ x[9] ^ x[11] ^ x[12];
    y[13] = x[0] ^ x[3] ^ x[6] ^ x[7] ^ x[8] ^ x[10] ^ x[13];
    y[14] = x[0] ^ x[3] ^ x[4] ^ x[5] ^ x[9] ^ x[11] ^ x[14];
    y[15] = x[1] ^ x[2] ^ x[4] ^ x[5] ^ x[8] ^ x[10] ^ x[15];
    memcpy( x, y, 16 ) ;
    memset( y, 0, 16 ) ;
}

/********************************************************************
    code to manage the array of four 128-bit constants per pool
    not really constants; some of this code changes them
    treated as constants in the extract-from-pool code
*********************************************************************/

/*
    mixers to change things in the array
    two versions, XOR-then-add and add-then-XOR

    neither can reduce entropy, even with bad inputs
*/
static void mix2array1( u32 *row, u32 *x)
{
    // stir old data, using XOR
    aria_mix( (byte *) row ) ;
    // mix in new, using 32-bit addition
    add128( row, x ) ;
}

static void mix2array2( u32 *row, u32 *x)
{
    // stir old data, using addition
    pht128( row ) ;
    // mix in new, using XOR
    xor128( row, x ) ;
}

/*
    iterative updater
    call one or other of the above
    to update some part of the array data

    A and C are used in most of the hashing
    B and D are used for final output in mix_last()
    This updates them all in alphabetical order

    every call to this affects all future outputs
     from the pool

    any two successive calls change one of A or C,
     affecting all future feedback as well
*/
static void buffer2array( struct my_pool *p, u32 *x)
{
    u32 *row ;

    // printf( "buffer2array() p %08x input %08x\n", p, x ) ;

    // different row on each call
    switch( p->which & 3 )    {
        case 0:
            row = p->A ;
            break ;
        case 1:
            row = p->B ;
            break ;
        case 2:
            row = p->C ;
            break ;
        case 3:
            row = p->D ;
            break ;
    }

    // alternate between update types
    if( p->which > 3 )
        mix2array1( row, x ) ;
    else
        mix2array2( row, x ) ;

    // sanitize
    zero128( x ) ;

    // update counter
    p->which++ ;
    if( p->which > 7 )
        p->which = 0 ;
}

/*
    update one or more constants for a pool
    using output from that pool

    pool is also stirred
    all output routines call mix_last()
    that calls buffer2pool() which mixes in 128 bits
    and changes 8 32-bit words in the pool
*/

static void mix2const( struct my_pool *p, int n )
{
    u32 temp[4] ;

    while( n-- > 0)    {
        simple_get( p, temp ) ;
        buffer2array( p, temp ) ;
    }
}

/****************************************************************
    Code to mix a 512-bit chunk of memory, 16 32-bit words
    based on part of Bernstein's ChaCha stream cipher code

    chacha_array() mixes the constants array for a pool
    stir_array() adds data then calls chacha_array()

    chacha_mix() could mix any 512-bit chunk of data
    current code uses it only to implement *_array()
*****************************************************************/

/*
    Bernstein comment & code

    chacha-ref.c version 20080118
    D. J. Bernstein
    Public domain.

#define QUARTERROUND(a,b,c,d) \
  x[a] = PLUS(x[a],x[b]); x[d] = ROTATE(XOR(x[d],x[a]),16); \
  x[c] = PLUS(x[c],x[d]); x[b] = ROTATE(XOR(x[b],x[c]),12); \
  x[a] = PLUS(x[a],x[b]); x[d] = ROTATE(XOR(x[d],x[a]), 8); \
  x[c] = PLUS(x[c],x[d]); x[b] = ROTATE(XOR(x[b],x[c]), 7);

static void salsa20_wordtobyte(byte output[64],const u32 input[16])
{
  u32 x[16];
  int i;

  for (i = 0;i < 16;++i) x[i] = input[i];
  for (i = 12;i > 0;i -= 2) {
    QUARTERROUND( 0, 4, 8,12)
    QUARTERROUND( 1, 5, 9,13)
    QUARTERROUND( 2, 6,10,14)
    QUARTERROUND( 3, 7,11,15)
    QUARTERROUND( 0, 5,10,15)
    QUARTERROUND( 1, 6,11,12)
    QUARTERROUND( 2, 7, 8,13)
    QUARTERROUND( 3, 4, 9,14)
  }
  for (i = 0;i < 16;++i) x[i] = PLUS(x[i],input[i]);
  for (i = 0;i < 16;++i) U32TO8_LITTLE(output + 4 * i,x[i]);
}
*/

// my version

#define ROTL(v, n) ( ((v) << (n)) | ((v) >> (32 - (n))) )

static void quarterround( u32 *x, int a, int b, int c, int d )
{
  x[a] += x[b] ; x[d] ^= x[a] ; x[d] = ROTL( x[d], 16) ;
  x[c] += x[d] ; x[b] ^= x[c] ; x[b] = ROTL( x[b], 12) ;
  x[a] += x[b] ; x[d] ^= x[a] ; x[d] = ROTL( x[d],  8) ;
  x[c] += x[d] ; x[b] ^= x[c] ; x[b] = ROTL( x[b],  7) ;
}

/*
    mix a 512-bit buffer in place
    treat it as a 4x4 array of 32-bit words

    ChaCha uses 8, 12 or 20 rounds
    this does 4 * n
*/
static void chacha_mix( u32 *data, int n )
{
    u32 x[16] ;
    int i;

    // copy data[] into x[]
    memcpy( (byte *) x, (byte *) data, 64) ;

    // stir x[]
     for( i = 4*n ; i > 0; i -= 2) {
        // mix rows
        quarterround(  x, 0, 4, 8,12 ) ;
        quarterround(  x, 1, 5, 9,13 ) ;
        quarterround(  x, 2, 6,10,14 ) ;
        quarterround(  x, 3, 7,11,15 ) ;
        // mix columns
        quarterround(  x, 0, 5,10,15 ) ;
        quarterround(  x, 1, 6,11,12 ) ;
        quarterround(  x, 2, 7, 8,13 ) ;
        quarterround(  x, 3, 4, 9,14 ) ;
    }

    /*
    add x[] back into data[]
    Bernstein uses bytewise addition
    I use 32-bit, maybe faster & carries improve mixing
    */
    for( i = 0 ; i < 16 ; i++ )
        data[i] += x[i] ;

    // sanitize
    memset( (byte *) x, 0, 64) ;
}

/*
    apply chacha_mix() to the constants array for a pool
*/
#define N_CHACHA  2

static void chacha_array( struct my_pool *p )
{
    // printf("chacha_array()\n") ;
    chacha_mix( p->A, N_CHACHA ) ;
}

/*
    mix 128-bit chunks into two rows
    then mix whole array
*/
static void stir_array( struct my_pool *p )
{
    // printf("stir_array(), p is %08x\n", p) ;
    mix2const( p, 2 ) ;
    chacha_array( p ) ;
}

/*****************************************************************
    a 128-bit counter to mix in when hashing
******************************************************************/

/*
    There is only one counter[] and one count() function to
    update it. mix_last() calls that so the count is affected
    by all output operations on any pool. It is initialised
    randomly and only affects outputs indirectly. The counter
    value should therefore be quite difficult for an enemy to
    discover.

    mix_last() also mixes in the counter so it affects all
    output from any pool and all feedback into any pool.

    That is, this counter provides a way for operations on
    any pool to automatically influence both of the other
    pools, albeit in an indirect and rather limited way.

    Operations on this counter do not affect the per-pool
    counts for any pool, neither the entropy count nor
    the p->count iteration counter.
*/

// this should be initialised randomly
static u32 counter[4] ;

// constant from SHA-1
#define COUNTER_DELTA 0x67452301

static int iter_count = 0 ;

/*
    code is based on my own work in the Enchilada cipher:
    https://aezoo.compute.dtu.dk/doku.php?id=enchilada

    Mix operations so Hamming weight changes more than for
    a simple counter. This may not be strictly necessary,
    but low Hamming weight differences do allow some attacks
    on block ciphers and the high bits of a large counter
    that is only incremented do not change for aeons. The
    extra code here is cheap insurance.

    A bit nonlinear since it uses +, XOR and rotation.

    For discussion, see mailing list thread starting at:
    http://www.metzdowd.com/pipermail/cryptography/2014-May/021345.html
*/
static void count(void)
{
    // printf( "count(), iter_count %d\n", iter_count ) ;
    /*
    Limit the switch to < 256 cases
    should work with any CPU & compiler

    Five constants used, all primes
    roughly evenly spaced, around 50, 100, 150, 200, 250
    */
    switch( iter_count )    {
        /*
        mix three array elements
        each element is used twice
        once on left, once on right
        pattern is circular
        */
        case 47:
            counter[1] += counter[2] ;
            break ;
        case 101:
            counter[2] += counter[3] ;
            break ;
        case 197:
            counter[3] += counter[1] ;
            break ;
        /*
        inject counter[0] into that loop
        loop and counter[0] use +=
        so use ^ here
        */
        case 149:
            counter[1] ^= counter[0] ;
            break ;
        /*
        restart loop
        include a rotation for nonlinearity
        */
        case 251:
            counter[0] = ROTL( counter[0], 5) ;
            iter_count = -1 ;
            break ;
        /*
        for 246 out of every 251 iterations
        the switch does nothing
        */
        default:
            break ;
    }
    /*
    counter[0] is almost purely a counter
    += instead of ++ to change Hamming weight more

    nothing above affects it
    except the rotation every 251 iterations
    */
    counter[0] += COUNTER_DELTA ;
    iter_count++ ;
}

/******************************************************************
    manage pools, extract data from them
*******************************************************************/

/*
    basic mixing routine for pools

    mixes in 32 bits of data
    changes two pool elements per call
    mixes XOR and + for nonlinearity

    for when r is known to point to
    high-entropy data
        hardware RNG data
        hash output
        cipher output (not used here)

    input mixing should NOT use this
    existing driver code is far better for
    low-to-medium entropy inputs

    Existing code is OK for high-entropy
    stuff as well. I added this in hopes
    it would be faster. Also, adding a
    different mixer gives insurance if a
    weakness turns up in the existing one.
*/
static inline void mixer( u32 *p, u32 *q, u32 *r)
{
    /*
    mix in external data
    this cannot reduce entropy, even with bad data
    */
    *p ^= *r ;
    /*
    pseudo-Hadamard transform
    spread the effect a bit
    invertible, cannot lose entropy
    */
    *q += *p ;
    *p += *q ;
}

/*
    actual pool mixer for 32-bit chunks
    mixer() above plus pointer management

    if array size & inilialisation are sensible
    and array is N 32-bit words, then:

    after N/2 iterations,
    every array element has been changed

    after N iterations,
    every element has been changed twice
    once as *p, once as *q

    Eventually this stirs the entire pool,
    making every pool word depend both on
    every other pool word and on many
    external inputs. This is the only
    stirring the output pools get.

    If that is considered not fast enough,
    aria_mix(), pht128() or chacha_mix()
    could provide additional stirring.
    I do not think that is necessary.
*/
static inline void mix2pool( struct my_pool *pool, u32 *r)
{
    // printf( "mix2pool() %08x %08x %08x %08x\n", pool->data ,
pool->p, pool->q, pool->end) ;
    mixer(pool->p, pool->q, r) ;
    // increment pointers
    pool->p += pool->delta ;
    pool->q += pool->delta ;
    // wrap around if needed
    if( pool->p >= pool->end )
        pool->p -= pool->size ;
    if( pool->q >= pool->end )
        pool->q -= pool->size ;
}

/*
    mix a 128-bit buffer into a pool
    changes 8 32-bit pool words
    then clears its input buffer

    This is used in mix_last()
    to mix feedback data into the pool itself

    for other mixing, buffer2array() is preferred
    because the effects are more easily analysed
*/
static void buffer2pool( struct my_pool *pool, u32 *r)
{
    int i ;
    u32 *s ;

    // printf( "buffer2pool(), data at %08x\n", r ) ;

    for( i = 0, s = r ; i < 4 ; i++, s++ )
        mix2pool( pool, s ) ;
    zero128( r ) ;
}

/*******************************************************************
    hashing code
*******************************************************************/

/*
    AES-GCM authentication code

    from Dan Bernstein's example implementation
    distributed as part of CAESAR test code
    http://competitions.cr.yp.to/caesar.html

    I changed his 64-bit length type to u32
    enough for this application

Bernstein's description:

    a = (a + x) * y in the finite field
    16 bytes in a
    xlen bytes in x; xlen <= 16; x is implicitly 0-padded
    16 bytes in y
*/
static void addmul(byte *a, const byte *x, u32 xlen, const byte *y)
{
  int i;
  int j;
  byte abits[128];
  byte ybits[128];
  byte prodbits[256];
  for (i = 0;i < xlen;++i) a[i] ^= x[i];
  for (i = 0;i < 128;++i) abits[i] = (a[i / 8] >> (7 - (i % 8))) & 1;
  for (i = 0;i < 128;++i) ybits[i] = (y[i / 8] >> (7 - (i % 8))) & 1;
  /*
  splint(1) complains here about int assigned to byte
  for (i = 0;i < 256;++i) prodbits[i] = 0;
  so replace it with next line, maybe faster too?
  */
  memset( prodbits, 0, 256 ) ;
  for (i = 0;i < 128;++i)
    for (j = 0;j < 128;++j)
      prodbits[i + j] ^= abits[i] & ybits[j];
  for (i = 127;i >= 0;--i) {
    prodbits[i] ^= prodbits[i + 128];
    prodbits[i + 1] ^= prodbits[i + 128];
    prodbits[i + 2] ^= prodbits[i + 128];
    prodbits[i + 7] ^= prodbits[i + 128];
    prodbits[i + 128] ^= prodbits[i + 128];
  }
  /*
  for (i = 0;i < 16;++i) a[i] = 0;
  */
  memset( a, 0, 16 ) ;
  for (i = 0;i < 128;++i) a[i / 8] |= (prodbits[i] << (7 - (i % 8)));
}

/*
    Mix n bytes into an accumulator using addmul()

    This is a keyed hash that takes nbytes of input,
    a 128-bit initial value and 128-bit key, and
    gives a 128-bit output.

    This routine does not either initialise the
    accumulator or finalise output. The expected
    calling sequence looks like this:

        intialise accumulator (from p->A)
        call this one or more times (with p->C)
          each call with different data
        finalise output (using p->B, C, D)

    The main use here is against the various pools
    replacing the hash previously used there. This
    should be faster, but it needs analysis.

    Note that it can be used with any data. In
    AES-GCM it is run over unencrypted headers so
    those can be authenticated along with encrypted
    text.

    Here it might be run over any kernel data
    structure that is expected to be unpredictable
    to an enemy, giving extra entropy.

    It can also be run over anything expected to
    be different on each machine (e.g. Ethernet
    MACs), on each boot (clock data) or on each
    read of /dev/urandom (process info for reader).

    Such data cannot be trusted for entropy (it
    may be unknown to some attackers, but not
    reliably to all), but it can still be useful
     in a role like that of salt in a hash; it
    makes brute force attacks much harder.
*/
static void mix_in( byte *data, u32 nbytes, byte *mul, u32 *accum)
{
    u32 len, left ;
    byte *p ;
    for( p = data, left = nbytes ; left != 0 ; p += len, left -= len)    {
        len = (left >= 16) ? 16 : left ;
        addmul( (byte *) accum, p, len, mul ) ;
    }
}

/*
    start of every output routine

    this may update the constants array for the pool
    based on p->count, a per-pool count different
    from global counter[] and count()

    this is not needed often since there is other mixing

    The Schneier et al Yarrow rng design uses 3DES
    with a 64-bit output on the output side. They
    reseed that from its own output every 10 blocks
    Here we have feedback into the pool on every
    iteration so we need not reseed as often.

    For the input and nonblocking pools, this is
    the only code that changes p->count.
    For the input pool, it is also the only code
    that uses p->count

    the nonblocking pool uses its p->count to decide
    when rekeying is needed. To decide where to get
    rekeying data, it looks at entropy count for the
    input pool and p->count for the nonblocking pool

    the blocking pool uses its p->count differently
    code in loop_nonblock_p()

    constants defined here are more-or-less arbitrary
    257 is 2^8+1 which divides 2^16-1
    so one constant is updated
    shortly before we reach MAX_COUNT at 2^16+1
*/
#define MAX_COUNT ((1<<16)+1)
#define FREQUENCY 257

static void mix_first( struct my_pool *p, u32 *buffer )
{
    u32 temp[4] ;
    p->count++ ;

    // printf( "mix_first()\n" ) ;

    if( (p->count % FREQUENCY) == 0 )    {
        // update one array element
        memcpy( (byte *) temp, (byte *) p->A, 16 ) ;
        mix_last( p, temp) ;
        buffer2array( p, temp ) ;
    }
    if( p->count >= MAX_COUNT )    {
        // update the whole constants array
        stir_array( p ) ;
        p->count = 0 ;
    }

    // initialise the buffer
    memcpy( (byte *) buffer, (byte *) p->A, 16 ) ;

    // MAYBE ADD CODE HERE?
}

/*
    Last step of any pool mixing sequence

    AES-GCM authentication is

      initialise accumulator all-zero
      mix in data with multiplier H
      xor in H before final output

    Our algorithm is

      call mix_first()
        maybe update the constants array
        initialise accumulator from p->A
      maybe do some other things
      call mix_last() for output

    What mix_last() does is

      mix in counter[] with multiplier p->C
      mix in pool data with multiplier p->C
      xor in p->C

      feed result back into pool

      re-initialise accumulator from p->B
      mix in previous result with multiplier p->D
      xor in p->D to get final output

    Both in the hashing and in the output mix,
    the same constant (C or D respectively) is
    used twice, in finite field multiplication
    then in an XOR. This is like the way AES-GCM
    authentication uses its constant H.

    It is also similar to a method of constructing
    a hash from a block cipher that XORs the cipher
    key into the output. Preneel, Govaerts and
    Vandewalle give a security proof for that.

    Their proof does not apply here. For one thing,
    they assume the block cipher is secure, but just
    multiplying by a key almost certainly does not
    give a secure cipher. However, their work does
    provide an argument that this construction is
    sensible.
*/
static void mix_last( struct my_pool *p, u32 *buffer )
{
    u32 temp[4] ;

    // printf( "mix_last()\n" ) ;

    // MAYBE ADD CODE HERE?

    // mix in counter and update it
    addmul( (byte *) buffer, (byte *) counter, 16, (byte *) p->C) ;
    count() ;

    // then pool data
    mix_in( (byte *) p->data, p->size, (byte *) p->C, buffer ) ;
    xor128( buffer, p->C ) ;

    // save result for later use
    memcpy( temp, buffer, 16 ) ;
    // feed results back into pool
    buffer2pool( p, buffer ) ;

    /*
    create an output different from data used in feedback
    using another hash step with different constants B, D
    */
    memcpy( buffer, p->B, 16 ) ;
    // mix in saved data
    addmul( (byte *) buffer, (byte *) temp, 16, (byte *) p->D) ;
    xor128( buffer, p->D ) ;

    // sanitize
    zero128( temp ) ;
}

/*
    The comments // MAYBE ADD CODE HERE? above
    indicate places where it would be logical
    to mix in timer data, hardware rng data, or
    data from something that uses CPU timing,
    such as Havege or Stephan Mueller's jitter.
    If we are running in a VM, data from the
    parent OS.

    Doing this in either mix_first() or mix_last()
    would ensure it was used for every output.

    The data could be put where convenient
    either in the pool or the output buffer
    either way it would affect both feedback & output

    "MAYBE" because there are other alternatives

    Just mix the data into the input pool and
    let the effects propagate from there?
    Existing code has add_timer_randomness() and
    related things to deal with the timer. If it
    ain't broke, ...

    Mix it in as part of the code to extract
    output from the input pool? This would affect
    all pools since that process also feeds data
    back into the input pool. It would be cheaper
    than doing it in mix_first() or mix_last().

    For now, I do none of the above, just note
    possibilities here.
*/

/************************************************************
    internal routines to get data from pools
    just fill a 128-bit buffer
*************************************************************/

/*
    just get 128 bits from a pool
    do nothing extra
*/
static void simple_get( struct my_pool *p, u32 *out )
{
    mix_first( p, out ) ;
    mix_last( p, out ) ;
}

/*
    for either output pool we do more
    different for the two pools
*/
static void get_block( u32 *out )
{
    struct my_pool *p ;
    u32 temp[4] ;
    p = &blocking_pool ;

    /*
    mix 128 new bits into our constants
    changes all future output
    */
    simple_get( &input_pool, temp ) ;
    buffer2array( p, temp ) ;

    /*
    reset counter since we have fresh data
    the nonblocking pool can use this
    to determine whether reseeding from here is safe
    */
    p->count = 0 ;

    /*
    we have some fresh data
    it seems wasteful to use it only for one output
    but risky to use it for >1 visible output

    so update the constants arrays for all pools
    stirring our data array in the process
    */
    simple_get( p, temp ) ;
    buffer2array( &input_pool, temp ) ;
    simple_get( p, temp ) ;
    buffer2array( &nonblocking_pool, temp ) ;
    /*
    this pool last
    so actual output uses different constants
    than feedback into other pools

    since this is the 2nd feedback into this
    pool's array, it also guarantees one of
    A or C is changed so all future feedback
    as well as output will be affected
    */
    simple_get( p, temp ) ;
    buffer2array( p, temp ) ;

    // generate output
    simple_get( p, out ) ;
    /*
    exit with p->count == 4 since there are
    4 simple_get() calls after the reset
    */
}

/*
    constants to control external rekeying of
    the non-blocking pool; the values here
    are more-or-less arbitrary first guesses
    as usual for me, I have used some primes

    all pools do self-rekeying (from same pool)
    in mix_first()

    input pool rekeys from external data
    blocking pool rekeys from the input pool
    before every output

    the only place where external rekeying
    (using data from a different pool) needs
    management is here
*/

// how many outputs can we safely take from a seeded pool?
#define SAFE_OUT 331

#define SAFE_BITS (INPUT_POOL_WORDS*24)
#define SOME_BITS 1024
#define K_LO    47
#define K_HI    SAFE_OUT

static void get_nonblock( u32 *out )
{
    struct my_pool *p ;
    u32 temp[4] ;
    p = &nonblocking_pool ;

    /*
    IMPROVE CODE HERE
    this is first-cut general idea stuff

    design goal is to decouple the two
    output pools so that heavy use of
    /dev/urandom cannot starve /dev/random

    blocking pool reseeds whenever it is used
    so it can sometimes provide data to reseed here
    */

    if( (p->count % SAFE_OUT) == 0 )    {

        // try everything reasonable
        if( blocking_pool.count <= K_LO )
            simple_get( &blocking_pool, temp ) ;
        else if( input_pool.entropy_count > SAFE_BITS )
            simple_get( &input_pool, temp ) ;
        else if( blocking_pool.count <= K_HI )
            simple_get( &blocking_pool, temp ) ;
        else if( input_pool.entropy_count > SOME_BITS )
            simple_get( &input_pool, temp ) ;

        // if those all fail, do the best we can
        else    {
            // ADD REAL LOG MESSAGE HERE
            printf( "get_nonblock(), bad reseed\n") ;

            chacha_array( p ) ;
            simple_get( p, temp ) ;
            xor128( temp, counter );

            /*
            set p->count so we will rekey again sooner
            not immediately, give some time for other
            pools to get some entropy
            */
            p->count += ((3*SAFE_OUT)>>2) ;
        }
        buffer2array( p, temp ) ;
    }
    // generate output
    simple_get( p, out ) ;
}

/*****************************************************************
    loop to fill a buffer with output data
    four somewhat different routines

        from input pool
        from blocking pool
        nonblocking to /dev/random
        nonblocking to get_random_bytes()

    perhaps a parent OS providing data to a child VM
    should use a 5th variant?

    all update their constants array first so that
    each call will generate an output stream almost
    independent of previous streams.

    For a rationale, see the Fortuna paper by Schneier et al.
    They are rekeying a counter-mode block cipher, but the
    same principle applies here.

    Some could also mix in extra data
    Doing that here, once per batch of output, is much
    cheaper than per block
    Comments indicate what I think is plausible
*****************************************************************/

static void loop_input( u32 *out, u32 nbytes )
{
    u32 temp[4], n, m ;
    struct my_pool *p ;
    byte *x ;

    p = &input_pool ;
    mix_first( p, temp ) ;

    /*
    ADD CODE HERE?

    timer or hardware rng data could be mixed in
    see earlier comments on those

    consider running addmul() over volatile
     kernel data structures
    page tables? process list? ...?
    This could provide extra entropy.
    */

    mix_last( p, temp ) ;
    buffer2array( p, temp ) ;

    // produce output
    for( n = nbytes, x = (byte *) out ; n != 0 ; n -= m, x += m )    {
        m = (n >= 16) ? 16 : n ;
        simple_get( p, temp ) ;
        memcpy( x, (byte *) temp, m) ;
    }
    zero128( temp ) ;
}

static void loop_block( u32 *out, u32 nbytes )
{
    struct my_pool *p ;
    u32 temp[4], n, m ;
    byte *x ;
    p = &blocking_pool ;

    /*
    no need to add extra data here
    since get_block() does lots of that
    but mix the constants array
    */
    chacha_array( p ) ;

    // produce output
    for( n = nbytes, x = (byte *) out ; n != 0 ; n -= m, x += m )    {
        m = (n >= 16) ? 16 : n ;
        get_block( temp ) ;
        memcpy( x, (byte *) temp, m) ;
    }
    zero128( temp ) ;
}

static void loop_urandom( u32 *out, u32 nbytes )
{
    struct my_pool *p ;
    u32 temp[4], n, m ;
    byte *x ;

    p = &nonblocking_pool ;
    mix_first( p, temp ) ;
    /*
    ADD CODE HERE

    Mix in process info for reading process
    apply addmul() to task_info struct

    This depends on a different aspect of the
    system than anything else in the driver,
    namely the order in which user processes
    ask for data and the current state of those
    processes.

    Except on very simple embedded systems,
    this should be hard to guess. It should be
    impossible to monitor unless the attacker
    is logged into the system or has left a
    background process running on it. Even
    then, monitoring it would not be easy.

    The code should NOT update any entropy
    estimate since we have no clear idea how
    much entropy this gives and it may well
    be zero in some cases.
    */
    mix_last( p, temp ) ;
    buffer2array( p, temp ) ;

    /*
    if code is added above to mix extra stuff into this pool
    then we should propagate the changes to other pools
    if not, this does no harm
    */
    get_nonblock( temp ) ;
    buffer2array( &input_pool, temp ) ;

    // produce output
    for( n = nbytes, x = (byte *) out ; n != 0 ; n -= m, x += m )    {
        m = (n >= 16) ? 16 : n ;
        get_nonblock( temp ) ;
        memcpy( x, (byte *) temp, m) ;
    }
    zero128( temp ) ;
}

static void loop_random_bytes( u32 *out, u32 nbytes )
{
    struct my_pool *p ;
    u32 temp[4], n, m ;
    byte *x ;

    p = &nonblocking_pool ;

    get_nonblock( temp ) ;
    buffer2array( p, temp ) ;

    // produce output
    for( n = nbytes, x = (byte *) out ; n != 0 ; n -= m, x += m )    {
        m = (n >= 16) ? 16 : n ;
        get_nonblock( temp ) ;
        memcpy( x, (byte *) temp, m) ;
    }
    zero128( temp ) ;
}

/*************************************************************
    setup routines, called once at startup
*************************************************************/

/*
    non-random initialisation
    set up pointers
    initialise variables
*/
static void init_static()
{
    u32 *x ;
    struct my_pool *p ;

    input_pool.data = input_pool_data ;
    blocking_pool.data = blocking_pool_data ;
    nonblocking_pool.data = nonblocking_pool_data ;
    input_pool.size = INPUT_POOL_WORDS ;
    blocking_pool.size = nonblocking_pool.size = OUTPUT_POOL_WORDS ;
    input_pool.which = blocking_pool.which = nonblocking_pool.which = 0 ;
    input_pool.count = blocking_pool.count = nonblocking_pool.count = 0 ;
    input_pool.entropy_count = blocking_pool.entropy_count =
nonblocking_pool.entropy_count = 0 ;

    // A,B, C, D are pointers into array of random-per-compile data
    x = init_data ;
    p = &input_pool ;
    p->p = p->data ;
    p->q = p->data + p->size/2;
    p->end = p->data + p->size;
    p->delta = 3 ;
    p->A = x ; x += 4 ;
    p->B = x ; x += 4 ;
    p->C = x ; x += 4 ;
    p->D = x ; x += 4 ;
    // x is not reset between pools
    p = &blocking_pool ;
    p->p = p->data ;
    p->q = p->data + p->size/2;
    p->end = p->data + p->size;
    p->delta = 5 ;
    p->A = x ; x += 4 ;
    p->B = x ; x += 4 ;
    p->C = x ; x += 4 ;
    p->D = x ; x += 4 ;
    p = &nonblocking_pool ;
    p->p = p->data ;
    p->q = p->data + p->size/2;
    p->end = p->data + p->size;
    p->delta = 7 ;
    p->A = x ; x += 4 ;
    p->B = x ; x += 4 ;
    p->C = x ; x += 4 ;
    p->D = x ;
}

#ifdef HAVE_HW_RAND

#include <stdlib.h>

// for testing, emulate the device
static int arch_get_random_long( u32 *x )
{
    *x = (u32) random() ;
    return 1 ;
}

static int hw_init()
{
    int i ;
    u32 *x, t ;

    printf( "hw_init()\n" ) ;

    // update the entire constants array
    for( i = 0, x = init_data ; i < ARRAY_WORDS ; i++, x++ )            {
        if( !arch_get_random_long(&t))
             return 0;
        else    *x ^= t ;
    }
    t = 0 ;
    // the counter
    for( i = 0, x = counter ; i < 4 ; i++, x++ )                    {
        if( !arch_get_random_long(x))
             return 0;
    }
    // and all three pools
    for( i = 0, x = input_pool.data ; i < INPUT_POOL_WORDS ; i++, x++ )        {
        if( !arch_get_random_long(x))
             return 0;
    }
    for( i = 0, x = blocking_pool.data ; i < OUTPUT_POOL_WORDS ; i++,
x++ )        {
        if( !arch_get_random_long(x))
             return 0;
    }
    for( i = 0, x = nonblocking_pool.data ; i < OUTPUT_POOL_WORDS ;
i++, x++ )    {
        if( !arch_get_random_long(x))
             return 0;
    }
    return 1 ;
}
#endif

/*
    introduce random data

    This should not be done until there is
    enough (256 bits?) entropy in the input
    pool.

    This code does not deal with that problem!
    FIX BEFORE USING
*/
static void init_random()
{
    u32 *x, temp[4] ;
    struct my_pool *p ;
    int i ;

    init_static() ;
    printf("static init done\n" ) ;

    i = 0 ;

#ifdef HAVE_HW_RAND
    srandom( init_data[0] ) ;
    i = hw_init() ;
#endif

    /*
    if either no hardware or it failed
    initialise everything from input pool data

    mix_last() and therefore simple_get()
    both stir feedback into input pool
    so this also stirs that pool quite well
    */
    if( i == 0 )    {
        printf( "main branch init_random()\n" ) ;
        p = &input_pool ;
        mix_first( p, temp ) ;

        /*
        ADD CODE HERE

        use addmul() to mix in static info
        things that can act as salt
        need not be unpredictable
        just different on different systems
        e.g. ethernet MAC, other hardware info
        */

        mix_last( p, temp ) ;

        /*
        use that first result to initialise the counter
        this will affect all future outputs
        */
        memcpy( counter, temp, 16 ) ;

        /*
        initialise the output pools with random data
        code requires OUTPUT_POOL_WORDS%4 == 0

        use simple_get() rather than get_input_p()
        saves a bit of overhead
        */
        if( (OUTPUT_POOL_WORDS%4) != 0)
            printf( "OUTPUT_POOL_WORDS mod 4 must be zero, but here it
is %d\n", (OUTPUT_POOL_WORDS%4) ) ;

        for( i = 0, x = blocking_pool.data ; i < OUTPUT_POOL_WORDS ; i
+= 4, x += 4 )    {
            simple_get( &input_pool, temp ) ;
            xor128( x, temp ) ;
        }
        for( i = 0, x = nonblocking_pool.data ; i < OUTPUT_POOL_WORDS
; i += 4, x += 4 ) {
            simple_get( &input_pool, temp ) ;
            xor128( x, temp ) ;
        }
        zero128( temp ) ;
    }
    printf("most of init done\n") ;
    /*
    update the constant arrays for all pools
    this also stirs the data arrays
    */
    stir_array( &input_pool ) ;
    stir_array( &blocking_pool ) ;
    stir_array( &nonblocking_pool ) ;
}

/*************************************************************
    minimal rather dumb test program
*************************************************************/

#define BUFF_WORDS 64
#define BUFF_BYTES (4*BUFF_WORDS)

static void printbuff(u32 * p, int nwords)
{
    int i ;
    for( i = 0 ; i < nwords ; i++ )    {
        printf( "%08x", p[i] ) ;
        if( (i%8) == 7 )    (void) putchar('\n') ;
        else            (void) putchar(' ') ;
    }
    putchar('\n') ;
}

int main( int argc, char **argv)
{
    u32 buffer[BUFF_WORDS] ;
/*
    printf("Array at startup:\n" ) ;
    printbuff( init_data, ARRAY_WORDS ) ;
*/
    init_random() ;
    printf( "back in main() after init()\n" ) ;
/*
    printf("Array after init():\n" ) ;
    printbuff( init_data, ARRAY_WORDS ) ;
*/

    printf( "input pool output\n" ) ;
    loop_input( buffer, BUFF_BYTES) ;
    printbuff( buffer, BUFF_WORDS) ;

    printf( "/dev/random pool output\n" ) ;
    loop_block( buffer, BUFF_BYTES) ;
    printbuff( buffer, BUFF_WORDS) ;

    printf( "/dev/urandom pool output\n" ) ;
    loop_urandom( buffer, BUFF_BYTES) ;
    printbuff( buffer, BUFF_WORDS) ;

    printf( "get_random_bytes() output\n" ) ;
    loop_random_bytes( buffer, BUFF_BYTES) ;
    printbuff( buffer, BUFF_WORDS) ;

    return 0 ;
}