/*
-----------------------------------------------------------------
Software simulation of a 1024-bit hardware hash.  A work-in-progress
with the goal of entering the NIST cryptographic hash competition.

Bits are arranged in an 8x8 grid of 16-bit blocks.  A round consists of
64 16-bit permutations followed by rotating each of the 16 bit positions
by different amounts through the 64 blocks.  The rotates are chosen so
2 bits are shifted up by 0..7 and left by 0..7.

The nonlinear mix of 16-bit blocks is actually an 8-bit permutation 
applied to bits 0..7 and again to 8..15, with the 16 bits shuffled at
the end.  The mix is 5 NAND-gates deep.  Certain bits can be inverted
and still be implemented 5 NAND-gates deep.  Symmetry is broken by
inverting different subsets of these bits in each permutation. 
Symmetry breaking is done by XORing 64-bit constants to the invertible 
bits.

All 128 8-bit permutations could be done at once with SSE instructions, 
but I'm only using 64-bit registers here.  This isn't optimized much.
It takes 7 rounds forward or 7 rounds backwards for all bits to be 
affected.  

Is this better or worse than my previous Feistel-network based 16-bit
permutations?  At the moment, I'd say worse.  But I haven't measured
which function is best, or tuned to shift or XOR constants yet.

(by Bob Jenkins, July 24 2008, public domain)
-----------------------------------------------------------------
*/

#include <stdio.h>
#include <stddef.h>

typedef unsigned long  u4;
typedef unsigned long long u8;

#define LEN 16

#define ROWS 8
#define COLS 8

#define ROUNDS 200

/* do one 8-bit permutation */
void eight( u8 *x, u8 *y)
{
  u8 a0 = x[0];
  u8 a1 = x[1];
  u8 a2 = x[2];
  u8 a3 = x[3];
  u8 a4 = x[4];
  u8 a5 = x[5];
  u8 a6 = x[6];
  u8 a7 = x[7];
  u8 a8 = a3 ^ a6;
  u8 a9 = ~a7;
  u8 a10 = a5 & a9;
  u8 a11 = a4 | a5;
  u8 a12 = ~a11;
  u8 a13 = a10 | a12;
  u8 a14 = a8 & a13;
  u8 a15 = a5 ^ a7;
  u8 a16 = a8 | a15;
  u8 a17 = ~a16;
  y[0] = a14 | a17;
  u8 a19 = a0 ^ a4;
  u8 a20 = ~a15;
  u8 a21 = a19 & a20;
  u8 a22 = ~a6;
  u8 a23 = a4 & a22;
  u8 a24 = a0 | a4;
  u8 a25 = ~a24;
  u8 a26 = a23 | a25;
  u8 a27 = a26 & a15;
  y[1] = a21 | a27;
  u8 a29 = a0 ^ a6;
  u8 a30 = a1 ^ a2;
  y[2] = a29 ^ a30;
  u8 a32 = a0 ^ a3;
  u8 a33 = a4 & a9;
  u8 a34 = a3 & a7;
  u8 a35 = ~a34;
  u8 a36 = a5 & a35;
  u8 a37 = a33 | a36;
  u8 a38 = a32 & a37;
  u8 a39 = a4 & a5;
  u8 a40 = a39 | a9;
  u8 a41 = a40 & a11;
  u8 a42 = a32 | a41;
  u8 a43 = ~a42;
  y[3] = a38 | a43;
  u8 a45 = a1 & a4;
  u8 a46 = ~a4;
  u8 a47 = a2 & a46;
  u8 a48 = a45 | a47;
  u8 a49 = a48 & a20;
  u8 a50 = ~a5;
  u8 a51 = a4 & a50;
  u8 a52 = a2 | a5;
  u8 a53 = a52 & a7;
  u8 a54 = ~a53;
  u8 a55 = a51 | a54;
  u8 a56 = ~a1;
  u8 a57 = a5 & a56;
  u8 a58 = ~a45;
  u8 a59 = a7 & a58;
  u8 a60 = a57 | a59;
  u8 a61 = a55 & a60;
  y[4] = a49 | a61;
  y[5] = a32 ^ a30;
  u8 a64 = a1 ^ a6;
  u8 a65 = a2 ^ a3;
  y[6] = a64 ^ a65;
  u8 a67 = a7 & a46;
  u8 a68 = a39 | a67;
  u8 a69 = a29 & a68;
  u8 a70 = a5 | a46;
  u8 a71 = a4 | a7;
  u8 a72 = a70 & a71;
  u8 a73 = a29 | a72;
  u8 a74 = ~a73;
  y[7] = a69 | a74;
}

static int shift[LEN] = {56,43,22,41,15,52,28,18,38,1,48,35,13,2,31,61};
static u4 map[] = {2,3,4,7,9,12,14,15,0,5,1,8,6,13,10,11};

void perm(u8 x[LEN])
{
  u8  y[LEN];
  int i;

  /* the two 8-bit permutations */
  eight(x, y);
  eight(&x[LEN/2], &y[LEN/2]);

  /* break symmetry among the 64 16-bit permutations */
  y[2] ^= 0x18f8aa72369b75c2LL;
  y[5] ^= 0x337b824aab77201fLL;
  y[6] ^= 0x60bd51315e37b49cLL;
  y[10] ^= 0x82ed31eb138e02efLL;
  y[13] ^= 0x5fe101ed66fc3130LL;
  y[14] ^= 0x1019906dca58dffbLL;

  /* shuffle the output bits */
  for (i=0; i<LEN; ++i) {
    x[map[i]] = y[i];
  }

  /* rotate the output bits */
  for (i=0; i<LEN; ++i) {
    x[i] = (x[i] << shift[i]) ^ (x[i] >> (64-shift[i]));
  }
}

static int unperm[256];

void uneight(u8 *y, u8 *x)
{
  int i, j;
  for (i=0; i<8; ++i) {
    x[i] = 0;
  }
  for (i=0; i<64; ++i) {
    int z = 0;
    for (j=0; j<8; ++j) {
      if (y[j] & (((u8)1)<<i)) {
	z ^= (1<<j);
      }
    }
    z = unperm[z];
    for (j=0; j<8; ++j) {
      if (z & (1<<j)) {
	x[j] ^= (((u8)1)<<i);
      }
    }
  }
}

/* do the inverse of one round of the hash */
void iperm(u8 x[LEN])
{
  int i;
  u8 y[LEN];

  /* unrotate the output bits */
  for (i=0; i<LEN; ++i) {
    x[i] = (x[i] >> shift[i]) ^ (x[i] << (64-shift[i]));
  }

  /* unshuffle the output bits */
  for (i=0; i<LEN; ++i) {
    y[i] = x[map[i]];
  }

  /* break symmetry among the 64 16-bit permutations */
  y[2] ^= 0x18f8aa72369b75c2LL;
  y[5] ^= 0x337b824aab77201fLL;
  y[6] ^= 0x60bd51315e37b49cLL;
  y[10] ^= 0x82ed31eb138e02efLL;
  y[13] ^= 0x5fe101ed66fc3130LL;
  y[14] ^= 0x1019906dca58dffbLL;

  /* revert the two 8-bit permutations */
  uneight(y, x);
  uneight(&y[LEN/2], &x[LEN/2]);
}


/* a random number generator, for testing, not part of the hash */
typedef struct ranctx { u4 a; u4 b; u4 c; u4 d; } ranctx;

#define rot(x,k) ((x<<k)|(x>>(32-k)))
u4 ranval( ranctx *x ) {
  u4 e = x->a - rot(x->b, 27);
  x->a = x->b ^ rot(x->c, 17);
  x->b = x->c + x->d;
  x->c = x->d + e;
  x->d = e + x->a;
  return e;
}

void show(u8 x[LEN])
{
  int i,j,k;
  for (i=0; i<ROWS; ++i) {
    for (j=0; j<COLS; ++j) {
      for (k=0; k<4; ++k) {
	printf(" %s", (x[k] & (((u8)1)<<(COLS*i+j))) ? "1" : "0");
      }
    }
    printf("\n");
    for (j=0; j<COLS; ++j) {
      for (k=8; --k>=4; ) {
	printf(" %s", (x[k] & (((u8)1)<<(COLS*i+j))) ? "1" : "0");
      }
    }
    printf("\n");
    for (j=0; j<COLS; ++j) {
      for (k=8; k<12; ++k) {
	printf(" %s", (x[k] & (((u8)1)<<(COLS*i+j))) ? "1" : "0");
      }
    }
    printf("\n");
    for (j=0; j<COLS; ++j) {
      for (k=16; --k>=12; ) {
	printf(" %s", (x[k] & (((u8)1)<<(COLS*i+j))) ? "1" : "0");
      }
    }
    printf("\n");
  }
  printf("\n");
}

void showx(float x, float *worst, int verbose)
{
  x = x/ROUNDS;
  if (x < *worst) *worst = x;
  if (verbose) printf("%3d", (int)(100*x + 0.5));
  if (1.0-x < *worst) *worst = 1.0-x;
}

float showc(float c[ROWS][COLS][LEN], int verbose)
{
  int i,j,k;
  float worst = 1.0;
  for (i=0; i<ROWS; ++i) {
    for (j=0; j<COLS; ++j) {
      for (k=0; k<4; ++k) {
	showx(c[i][j][k], &worst, verbose);
      }
      if (verbose) printf(" ");
    }
    if (verbose) printf("\n");
    for (j=0; j<COLS; ++j) {
      for (k=8; --k>=4; ) {
	showx(c[i][j][k], &worst, verbose);
      }
      if (verbose) printf(" ");
    }
    if (verbose) printf("\n");
    for (j=0; j<COLS; ++j) {
      for (k=8; k<12; ++k) {
	showx(c[i][j][k], &worst, verbose);
      }
      if (verbose) printf(" ");
    }
    if (verbose) printf("\n");
    for (j=0; j<COLS; ++j) {
      for (k=16; --k>=12; ) {
	showx(c[i][j][k], &worst, verbose);
      }
      if (verbose) printf(" ");
    }
    if (verbose) printf("\n\n");
  }
  /* printf("  worst = %f\n", worst); */
  return worst;
}

void test(ranctx *rctx)
{
  u8 x[LEN], y[LEN];
  u4 m,n,o,p,count,i,j,k;
  float val, worst = 1.0;
  float zcount = 0.0;

  /* see how thoroughly bit "tab[m][n] & (1<<o)" propogates */
  for (m=0; m<ROWS; ++m) {
    for (n=0; n<COLS; ++n) {
      for (o=0; o<LEN; ++o) {
	for (p=0; p<2*LEN; p++) {
	  float c[ROWS][COLS][LEN];  /* count of how many times bit flipped */
	  u4 r;  /* count # of rounds */

	  /* if (m != 5 || n != 7 || o != 6 || p != 0) continue; */
	  /* if (m != 0) continue; */

	  /* initialize c */
	  for (i=0; i<ROWS; ++i) {
	    for (j=0; j<COLS; ++j) {
	      for (k=0; k<LEN; ++k) {
		c[i][j][k] = 0;
	      }
	    }
	  }

	  /* try r keys */
	  for (r=0; r<ROUNDS; ++r) {

	    /* initialize to a random key */
	    for (i=0; i<LEN; ++i) {
	      x[i] = y[i] = ((u8)ranval(rctx)) | (((u8)ranval(rctx))<<32);
	      /* x[i] = y[i] = (p/LEN ? ~((u8)0) : ((u8)0)); */
	    }
	    x[p%LEN] ^= ((u8)r)<<0;
	    y[p%LEN] = x[p%LEN];

	    /* start with bit [m][n][o] different */
	    y[o] ^= (((u8)1) << (COLS*m+n));

	    /* apply several rounds of the hash to both keys */
	    perm(x); perm(y);
	    perm(x); perm(y);
	    perm(x); perm(y);

	    perm(x); perm(y);
	    perm(x); perm(y);
	    perm(x); perm(y);

	    perm(x); perm(y);

	    /* measure which bits were affected */
	    for (k=0; k<LEN; ++k) {
	      u8 diff = x[k] ^ y[k];
	      for (i=0; i<ROWS; ++i) {
		for (j=0; j<COLS; ++j) {
		  if (diff & (((u8)1) << (COLS*i+j))) {
		    ++c[i][j][k];
		  }
		}
	      }
	    }
	  }

	  /* report which bits were affected */
	  val = showc(c, 0);
	  if (val == 0) {
	    ++zcount;
	  }
	  if (val < worst) {
	    worst = val;
	    printf("new worst: %f  %d %d %d %d\n", worst, m,n,o,p);
	  }
	}
      }
    }
  }
}

void set_unperm()
{
  u8 x[8];
  u8 y[8];
  int perm[256];
  int i, j;
  for (i=0; i<256; ++i) {
    for (j=0; j<8; ++j) {
      x[j] = (i & (1<<j)) ? 1 : 0;
    }
    eight(x, y);
    perm[i] = 0;
    for (j=0; j<8; ++j) {
      if (y[j] & 1) {
	perm[i] ^= (1<<j);
      }
    }
  }
  for (i=0; i<256; ++i) {
    unperm[perm[i]] = i;
  }
}

/* m fffff m fffff m fffff m fffff m fffff m */
void fffff(u8 *x)
{
  perm(x);
  perm(x);
  perm(x);
  perm(x);
  perm(x);
}

int main()
{
  int i;
  ranctx rctx = {1,1,1,63};
  for (i=0; i<20; ++i) ranval(&rctx);
  set_unperm();

  test(&rctx);
}