/* -----------------------------------------------------------------
Software implementation of a 1024-bit hash.  A candidate for being a
candidate for the NIST SHA-3 cryptographic hash competition.  Bits are
arranged in an 8x8 grid of 16-bit chunks.  A round (perm()) consists of
applying a 16-bit permutation to each chunk, then rotating each of the
16 bit positions by different amounts through the 64 chunks.  The
rotates are chosen so 2 bits each are shifted up by 0..7 and left by
0..7.  The nonlinear mix of 16-bit chunks is actually an 8-bit
permutation applied to the even bits and again to the odd bits, with
the 16 bits shuffled at the end.  This is 5 NAND-gates deep.  6 of the
16 bits implement a^b^c^d, and can be inverted and still be implemented
5 NAND-gates deep.  Symmetry is broken by inverting different subsets
of these 6 bits in each of the 64 chunks.  Symmetry breaking is done in
software by XORing constants.  The 16-bit permutation has avalanche of
0.375 when applied to itself three times.  The 64 16-bit permutations
are actually 128 8-bit permutations.  All 128 8-bit permutations are
done at once with SSE instructions.  It takes 8 rounds forward or 7
rounds backwards for all bits to be affected by all one-bit deltas for
nearly-zero states (avalanche 0.27 forward, 0.23 backward).  (7 rounds
forward (avalanche 0.378) or 6 backwards (avalanche 0.19) for random
states.)  I'm making an ungrounded assumption that twice forward +
backward avalanche, 30 rounds that is, is enough to thwart
cryptanalysis.  Further, I'm assuming it doesn't matter if these are
consecutive rounds.  This is done by combining a block every 3 rounds.
Each block is combined four times, at offsets i (plus two rounds),
i+15-6(i%4), i+26-6(i%4), and i+37.  The offset by two rounds for the
first use guarantees at least one round of differentials is required
per block.  Brute-force search said that all sets of 9 variables or
fewer require at least 30 rounds of differentials that way.
(by Bob Jenkins, August 14 2008, public domain) ----------------------------------------------------------------- */ #include #include #include #include typedef unsigned long u4; typedef unsigned long long u8; #define LEN 16 /* number of 8-byte values per block */ #define ROWS 8 #define COLS 8 #define ROUNDS 20000 #define xor(a,b) (a ^ b) #define or(a,b) (a | b) #define and(a,b) (a & b) #define ant(a,b) (~a & b) #define not(a) ~a #define read(x,i) x[i*2] #define write(y,i,a) y[i*2] = a /* apply the 8-bit permutation to the even bits */ static void eight( u8 *x, u8 *y) { u8 q,r; u8 a0 = read(x,0); u8 a1 = read(x,1); u8 a2 = read(x,2); u8 a3 = read(x,3); u8 a4 = read(x,4); u8 a5 = read(x,5); u8 a6 = read(x,6); u8 a7 = read(x,7); q = xor(a4,a5); r = xor(a7,a0); write(y,0,xor(q,r)); q = xor(a5,a1); r = xor(a3,a2); write(y,1,xor(q,r)); q = xor(a3,a4); r = xor(a5,a1); write(y,2,xor(q,r)); q = ant(xor(a0,a3),xor(a6,a4)); r = ant(xor(a4,a6),or(and(a0,a1),ant(a0,a3))); write(y,3,or(q,r)); q = ant(xor(a6,a7),xor(a5,a2)); r = and(or(not(or(a2,a5)),and(a4,a5)),xor(a7,a6)); write(y,4,or(q,r)); q = and(xor(a5,a0),xor(a4,a2)); r = ant(xor(a2,a4),or(and(a0,a5),ant(a0,a1))); write(y,5,or(q,r)); q = and(xor(a4,a2),xor(a7,a6)); r = ant(xor(a2,a4),or(ant(a7,a0),and(a6,a7))); write(y,6,or(q,r)); q = and(or(not(or(a6,a7)),ant(and(a6,a7),a0)),xor(a4,a2)); r = ant(xor(a2,a4),or(ant(and(a0,a2),a6),ant(a0,a7))); write(y,7,or(q,r)); } #undef xor #undef or #undef and #undef ant #undef not #undef read #undef write #define xor(a,b) _mm_xor_si128(a, b) #define or(a,b) _mm_or_si128(a, b) #define and(a,b) _mm_and_si128(a, b) #define ant(a,b) _mm_andnot_si128(a, b) #define not(a) _mm_xor_si128(a, _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff)) #define read(x,i) _mm_load_si128((void *)&x[i]) #define write(y,i,a) _mm_storeu_si128((void *)&y[i], a) static void eightSSE(__m128i *x, __m128i *y) { __m128i q,r; __m128i a0 = read(x,0); __m128i a1 = read(x,1); __m128i a2 = read(x,2); __m128i 
a3 = read(x,3); __m128i a4 = read(x,4); __m128i a5 = read(x,5); __m128i a6 = read(x,6); __m128i a7 = read(x,7); q = xor(a4,a5); r = xor(a7,a0); write(y,0,xor(q,r)); q = xor(a5,a1); r = xor(a3,a2); write(y,1,xor(q,r)); q = xor(a3,a4); r = xor(a5,a1); write(y,2,xor(q,r)); q = ant(xor(a0,a3),xor(a6,a4)); r = ant(xor(a4,a6),or(and(a0,a1),ant(a0,a3))); write(y,3,or(q,r)); q = ant(xor(a6,a7),xor(a5,a2)); r = and(or(not(or(a2,a5)),and(a4,a5)),xor(a7,a6)); write(y,4,or(q,r)); q = and(xor(a5,a0),xor(a4,a2)); r = ant(xor(a2,a4),or(and(a0,a5),ant(a0,a1))); write(y,5,or(q,r)); q = and(xor(a4,a2),xor(a7,a6)); r = ant(xor(a2,a4),or(ant(a7,a0),and(a6,a7))); write(y,6,or(q,r)); q = and(or(not(or(a6,a7)),ant(and(a6,a7),a0)),xor(a4,a2)); r = ant(xor(a2,a4),or(ant(and(a0,a2),a6),ant(a0,a7))); write(y,7,or(q,r)); } static int shift[LEN] = {40,60,26,4,0,3,11,15,13,13,1,41,14,7,18,14}; static int map[LEN] = {12,13,4,9,1,14,5,2,10,8,6,7,3,15,0,11}; /* one permutation of the internal state */ static void perm(u8 *x) { int i; __m128i y2[LEN/2]; u8 *y = (u8 *)y2; /* do 128 8-bit permutations */ eightSSE((__m128i *)x, (__m128i *)y); /* break symmetry among the 64 16-bit permutations */ y[0] ^= 0x18f8aa72369b75c2LL; y[1] ^= 0x337b824aab77201fLL; y[2] ^= 0x60bd51315e37b49cLL; y[3] ^= 0x82ed31eb138e02efLL; y[4] ^= 0x5fe101ed66fc3130LL; y[5] ^= 0x1019906dca58dffbLL; /* shuffle and rotate the 16 output bits among the 64 chunks */ x[12] = (y[0] << 40) | (y[0] >> (64-40)); x[13] = (y[1] << 60) | (y[1] >> (64-60)); x[ 4] = (y[2] << 26) | (y[2] >> (64-26)); x[ 9] = (y[3] << 4) | (y[3] >> (64- 4)); x[ 1] = y[4]; x[14] = (y[5] << 3) | (y[5] >> (64- 3)); x[ 5] = (y[6] << 11) | (y[6] >> (64-11)); x[ 2] = (y[7] << 15) | (y[7] >> (64-15)); x[10] = (y[8] << 13) | (y[8] >> (64-13)); x[ 8] = (y[9] << 13) | (y[9] >> (64-13)); x[ 6] = (y[10]<< 1) | (y[10]>> (64- 1)); x[ 7] = (y[11]<< 41) | (y[11]>> (64-41)); x[ 3] = (y[12]<< 14) | (y[12]>> (64-14)); x[15] = (y[13]<< 7) | (y[13]>> (64- 7)); x[ 0] = (y[14]<< 18) 
| (y[14]>> (64-18)); x[11] = (y[15]<< 14) | (y[15]>> (64-14)); } static int unperm[256]; /* the inverse of eight(x,y) */ static void uneight(u8 *y, u8 *x) { int i, j; for (i=0; i> shift[i]) | (y[i] << (64-shift[i])); } /* break symmetry among the 64 16-bit permutations */ y[0] ^= 0x18f8aa72369b75c2LL; y[1] ^= 0x337b824aab77201fLL; y[2] ^= 0x60bd51315e37b49cLL; y[3] ^= 0x82ed31eb138e02efLL; y[4] ^= 0x5fe101ed66fc3130LL; y[5] ^= 0x1019906dca58dffbLL; /* revert the two 8-bit permutations */ uneight(&y[1], &x[1]); uneight(y, x); } /* a random number generator, for testing, not part of the hash */ typedef struct ranctx { u4 a; u4 b; u4 c; u4 d; } ranctx; #define rot(x,k) ((x<>(32-k))) u4 ranval( ranctx *x ) { u4 e = x->a - rot(x->b, 27); x->a = x->b ^ rot(x->c, 17); x->b = x->c + x->d; x->c = x->d + e; x->d = e + x->a; return e; } void show(u8 x[LEN]) { int i,j,k; for (i=0; i=4; ) { printf(" %s", (x[k] & (((u8)1)<<(COLS*i+j))) ? "1" : "0"); } } printf("\n"); for (j=0; j=12; ) { printf(" %s", (x[k] & (((u8)1)<<(COLS*i+j))) ? 
"1" : "0"); } } printf("\n"); } printf("\n"); } void showx(float x, float *worst, int verbose) { x = x/ROUNDS; if (x < *worst) *worst = x; if (verbose) printf("%3d", (int)(100*x + 0.5)); if (1.0-x < *worst) *worst = 1.0-x; } float showc(float c[ROWS][COLS][LEN], int verbose) { int i,j,k; float worst = 1.0; for (i=0; i=4; ) { showx(c[i][j][k], &worst, verbose); } if (verbose) printf(" "); } if (verbose) printf("\n"); for (j=0; j=12; ) { showx(c[i][j][k], &worst, verbose); } if (verbose) printf(" "); } if (verbose) printf("\n\n"); } /* printf(" worst = %f\n", worst); */ return worst; } #define TESTONERUNS 100 void testone(ranctx *rctx) { u4 count[LEN]; __m128i x2[LEN/2]; __m128i y2[LEN/2]; u8 *x = (u8 *)x2; u8 *y = (u8 *)y2; u4 i; for (i=0; i<(1< .15*TESTONERUNS && count[j] < .85*TESTONERUNS) break; } if (j == LEN) { printf("%.4x ", i); for (j=0; j>(64-3); one_combine(a, &offset, state, buf); /* apply the remaining uses of the last few blocks */ for (i=0; i