/* -----------------------------------------------------------------
Software implementation of a 1024-bit hash; a candidate for being a
candidate for the NIST SHA-3 cryptographic hash competition.

Bits are arranged in an 8x8 grid of 16-bit chunks.  A round (perm())
consists of applying a 16-bit permutation to each chunk, then rotating
each of the 16 bit positions by a different amount through the 64
chunks.  The rotates are chosen so that 2 bits each are shifted up by
0..7 and left by 0..7.

The nonlinear mix of a 16-bit chunk is actually an 8-bit permutation
applied to the even bits and again to the odd bits, with the 16 bits
shuffled at the end.  It is 5 NAND-gates deep.  6 of the 16 bits
implement a^b^c^d; they can be inverted and still be implemented 5
NAND-gates deep.  Symmetry is broken by inverting different subsets of
those 6 bits in each of the 64 chunks; in software the symmetry
breaking is done by XORing constants.  The 64 16-bit permutations are
really 128 8-bit permutations.  All 128 8-bit permutations could be
done at once with SSE instructions, but I'm only using 64-bit
registers here.

It takes 8 rounds forward or 7 rounds backward for all bits to be
affected by all one-bit deltas for nearly-zero states (7 forward or 6
backward for random states).  I'll take a leap of faith and say that
twice the forward-plus-backward avalanche, that is 30 rounds, is
enough to thwart cryptanalysis, provided there are no self-referential
differentials.  This is done by combining a block every 3 rounds.
Each block is combined four times, at offsets i, i+12-5(i%3),
i+28-5((i+2)%3), and i+31.  A round is applied to each input before
the last two combines, to muddy linear algebra on how blocks are
combined.  A brute-force search said that, with this schedule, all
sets of 8 or fewer variables require at least 30 rounds of
differentials.
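As a worked example of that combining schedule (just the formulas
above evaluated, nothing more): block 0 is combined at offsets 0, 12,
18, and 31; block 1 at 1, 8, 29, and 32; block 2 at 2, 4, 25, and 33;
block 3 at 3, 15, 21, and 34.  Equivalently, every block is used at i
and i+31, and its two middle uses cycle with i mod 3 through i+12 and
i+18, then i+7 and i+28, then i+2 and i+23.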
(by Bob Jenkins, July 27 2008, public domain)
----------------------------------------------------------------- */

#include <stdio.h>
#include <stddef.h>
#include <string.h>
#include <emmintrin.h>   /* SSE2 intrinsics: __m128i, _mm_* */

typedef unsigned long      u4;  /* at least 32 bits; the PRNG below treats it as 32-bit */
typedef unsigned long long u8;

#define LEN    16     /* number of 8-byte values per block */
#define ROWS    8     /* the state is an 8x8 grid of 16-bit chunks */
#define COLS    8
#define ROUNDS 20000  /* number of trials; showx() reports counts/ROUNDS */

/* apply the 8-bit permutation to the even bits */
static void eight( u8 *x, u8 *y)
{
  u8 a0,a1,a2,a3,a4,a5,a6,a7,b8,b9,b10,b11,b12,b13,b14,b15;
  a1 = x[2];
  a7 = x[14];
  b8 = a1 ^ a7;
  a0 = x[0];
  a3 = x[6];
  a5 = x[10];
  y[8] = ((b8 & ((a0 & ~a5) | (a3 & ~a0))) | ~(b8 | (a0 ^ a5)));
  b9 = a3 ^ a5;
  b10 = b9 ^ a0;
  y[2] = b10 ^ a1;
  a2 = x[4];
  y[14] = b10 ^ a2;
  a4 = x[8];
  b11 = a2 ^ a4;
  y[6] = b9 ^ b11;
  a6 = x[12];
  b12 = a1 & a6;
  b13 = a1 ^ a6;
  b14 = a0 ^ a4;
  y[4] = ((b14 & b13) | ~(b14 | ((a1 | a7) & ~b12)));
  y[12] = ((b13 & ((a2 & ~a4) | (a7 & ~(a1 & a4)))) |
           ~(b13 | ((a2 | a7) & ~(a2 & a4))));
  b15 = a1 & a7;
  y[10] = ((b11 & b8) | (~b11 & (b15 | (a6 & ~a7))));
  y[0] = ((b13 & ~(~(a2 & ~a7) & (b15 | a4))) |
          (~(~b12 & (a1 | (a4 & a6))) & ((a2 & a7) | (a4 & ~a2))));
}

/* the 8-bit permutation again, in 128-bit registers: each __m128i spans two
   of the 64-bit words, so the even-bit and odd-bit halves (all 128 8-bit
   permutations) are done at once */
static void eightSSE(__m128i *x, __m128i *y)
{
  __m128i a0,a1,a2,a3,a4,a5,a6,a7,b9,b10;
  a0 = _mm_load_si128((void *)&x[0]);
  a1 = _mm_load_si128((void *)&x[1]);
  a2 = _mm_load_si128((void *)&x[2]);
  a3 = _mm_load_si128((void *)&x[3]);
  a4 = _mm_load_si128((void *)&x[4]);
  a5 = _mm_load_si128((void *)&x[5]);
  a6 = _mm_load_si128((void *)&x[6]);
  a7 = _mm_load_si128((void *)&x[7]);

  /* y[2] */
  b9 = _mm_andnot_si128(a1, _mm_andnot_si128(a7,
         _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff)));
  b10 = _mm_andnot_si128(_mm_xor_si128(a0,a4),
          _mm_or_si128(b9, _mm_and_si128(a1, a6)));
  _mm_storeu_si128((void *)&y[2],
    _mm_or_si128(_mm_and_si128(_mm_xor_si128(a0,a4), _mm_xor_si128(a1, a6)), b10));

  /* y[6] */
  b9 = _mm_andnot_si128(_mm_or_si128(_mm_xor_si128(a1, a6),
         _mm_andnot_si128(_mm_and_si128(a2,a4), _mm_or_si128(a2,a7))),
         _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff));
  b10 = _mm_and_si128(_mm_xor_si128(a1, a6),
          _mm_or_si128(_mm_andnot_si128(a4,a2),
                       _mm_andnot_si128(_mm_and_si128(a1,a4), a7)));
  _mm_storeu_si128((void *)&y[6], _mm_or_si128(b9, b10));

  /* y[0] */
  b9 = _mm_andnot_si128(_mm_andnot_si128(_mm_andnot_si128(a7,a2),
         _mm_or_si128(_mm_and_si128(a1,a7), a4)), _mm_xor_si128(a1, a6));
  b10 = _mm_andnot_si128(_mm_andnot_si128(_mm_and_si128(a1, a6),
          _mm_or_si128(a1, _mm_and_si128(a4,a6))),
          _mm_or_si128(_mm_and_si128(a2,a7), _mm_andnot_si128(a2,a4)));
  _mm_storeu_si128((void *)&y[0], _mm_or_si128(b9,b10));

  /* y[5] */
  b9 = _mm_andnot_si128(_mm_xor_si128(a2,a4),
         _mm_or_si128(_mm_and_si128(a1,a7), _mm_andnot_si128(a7,a6)));
  _mm_storeu_si128((void *)&y[5],
    _mm_or_si128(_mm_and_si128(_mm_xor_si128(a2,a4), _mm_xor_si128(a1,a7)), b9));

  /* y[4] */
  b9 = _mm_and_si128(_mm_xor_si128(a1,a7),
         _mm_or_si128(_mm_andnot_si128(a5,a0), _mm_andnot_si128(a0,a3)));
  b10 = _mm_andnot_si128(_mm_or_si128(_mm_xor_si128(a1,a7), _mm_xor_si128(a0,a5)),
          _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff));
  _mm_storeu_si128((void *)&y[4], _mm_or_si128(b9, b10));

  /* y[1], y[3], y[7] are the linear a^b^c^d outputs; the _mm_set_epi32
     constants are the symmetry-breaking XORs mentioned in the header */

  /* y[1] */
  _mm_storeu_si128((void *)&y[1],
    _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(a3, a5), a0),
      _mm_xor_si128(a1, _mm_set_epi32(0x82ed31eb, 0x138e02ef, 0x18f8aa72, 0x369b75c2))));

  /* y[3] */
  _mm_storeu_si128((void *)&y[3],
    _mm_xor_si128(_mm_xor_si128(a2,a4),
      _mm_xor_si128(_mm_xor_si128(a3, a5),
        _mm_set_epi32(0x5fe101ed, 0x66fc3130, 0x337b824a, 0xab77201f))));

  /* y[7] */
  b9 = _mm_xor_si128(_mm_set_epi32(0x1019906d, 0xca58dffb, 0x60bd5131, 0x5e37b49c),
         _mm_xor_si128(_mm_xor_si128(a3, a5), a0));
  _mm_storeu_si128((void *)&y[7], _mm_xor_si128(a2, b9));
}
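/* Illustration only (not part of the original hash): assuming the bit-sliced
   layout described in the header (word x[i] holds bit i of every 16-bit
   chunk, and the chunk at row r, column c sits at word bit COLS*r+c), this
   hypothetical helper gathers one chunk back out of the state, e.g.
   get_chunk(x, COLS*r+c).  It is a sketch for reading the representation,
   not something perm() needs. */
static u4 get_chunk(const u8 *x, int c)
{
  u4  chunk = 0;
  int i;
  for (i = 0; i < LEN; ++i)                /* one word per bit position */
    chunk |= (u4)((x[i] >> c) & 1) << i;   /* take chunk c's bit i */
  return chunk;
}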
/* shift[i] is the rotate applied to the bit that lands in x[i]; map[j] is the
   word that output bit j of the 8-bit permutations is sent to.  perm() below
   hard-codes the same shuffle and rotates.  Taken div 8 and mod 8, the 16
   rotate amounts each cover 0..7 exactly twice, giving the "shifted up by
   0..7 and left by 0..7" property claimed in the header. */
static int shift[LEN] = {56,38,43,1,22,48,41,35,15,13,52,2,28,31,18,61};
static int map[LEN]   = {4,0,6,10,8,2,14,1,3,12,9,11,13,5,15,7};

/* one permutation of the internal state */
static void perm(u8 *x)
{
  int i;
  __m128i y2[LEN/2];
  u8 *y = (u8 *)y2;

  /* do 128 8-bit permutations */
  eightSSE((__m128i *)x, (__m128i *)y);

  /* shuffle and rotate the 16 output bits among the 64 chunks */
  x[ 4] = (y[0] << 22) | (y[0] >> 42);
  x[ 0] = (y[1] << 56) | (y[1] >>  8);
  x[ 6] = (y[2] << 41) | (y[2] >> 23);
  x[10] = (y[3] << 52) | (y[3] >> 12);
  x[ 8] = (y[4] << 15) | (y[4] >> 49);
  x[ 2] = (y[5] << 43) | (y[5] >> 21);
  x[14] = (y[6] << 18) | (y[6] >> 46);
  x[ 1] = (y[7] << 38) | (y[7] >> 26);
  x[ 3] = (y[8] <<  1) | (y[8] >> 63);
  x[12] = (y[9] << 28) | (y[9] >> 36);
  x[ 9] = (y[10]<< 13) | (y[10]>> 51);
  x[11] = (y[11]<<  2) | (y[11]>> 62);
  x[13] = (y[12]<< 31) | (y[12]>> 33);
  x[ 5] = (y[13]<< 48) | (y[13]>> 16);
  x[15] = (y[14]<< 61) | (y[14]>>  3);
  x[ 7] = (y[15]<< 35) | (y[15]>> 29);
}

static int unperm[256];

/* the inverse of eight(x,y) */
static void uneight(u8 *y, u8 *x)
{
  int i, j;
  for (i=0; i> shift[i]) ^ (x[i] << (64-shift[i])); }
  /* unshuffle the output bits */
  for (i=0; i

/* a small noncryptographic pseudorandom number generator */
typedef struct ranctx { u4 a; u4 b; u4 c; u4 d; } ranctx;
#define rot(x,k) (((x)<<(k))|((x)>>(32-k)))
u4 ranval( ranctx *x )
{
  u4 e = x->a - rot(x->b, 27);
  x->a = x->b ^ rot(x->c, 17);
  x->b = x->c + x->d;
  x->c = x->d + e;
  x->d = e + x->a;
  return e;
}

/* print the state as individual bits */
void show(u8 x[LEN])
{
  int i,j,k;
  for (i=0; i=4; ) { printf(" %s", (x[k] & (((u8)1)<<(COLS*i+j))) ? "1" : "0"); } } printf("\n");
  for (j=0; j=12; ) { printf(" %s", (x[k] & (((u8)1)<<(COLS*i+j))) ? "1" : "0"); } } printf("\n"); } printf("\n");
}

/* report one frequency as a percentage and track the worst bias seen */
void showx(float x, float *worst, int verbose)
{
  x = x/ROUNDS;
  if (x < *worst) *worst = x;
  if (verbose) printf("%3d", (int)(100*x + 0.5));
  if (1.0-x < *worst) *worst = 1.0-x;
}

float showc(float c[ROWS][COLS][LEN], int verbose)
{
  int i,j,k;
  float worst = 1.0;
  for (i=0; i=4; ) { showx(c[i][j][k], &worst, verbose); } if (verbose) printf(" "); } if (verbose) printf("\n");
  for (j=0; j=12; ) { showx(c[i][j][k], &worst, verbose); } if (verbose) printf(" "); } if (verbose) printf("\n\n"); }
  /* printf(" worst = %f\n", worst); */
  return worst;
}

void testsse()
{
  int i, j, k;
  __m128i x2[LEN/2];
  __m128i y2[LEN/2];
  u8 *x = (u8 *)x2;
  u8 *y = (u8 *)y2;
  for (j=0; j<256; ++j) {
    for (i=0; i