Radical hash rate optimization

This commit is contained in:
Michael Toutonghi
2018-06-26 07:39:50 -07:00
parent 1a0fc30896
commit 4dcb64c081
15 changed files with 271 additions and 151 deletions

View File

@@ -28,6 +28,7 @@ Optimized Implementations for Haraka256 and Haraka512
#include "crypto/haraka.h"
u128 rc[40];
u128 rc0[40] = {0};
void load_constants() {
rc[0] = _mm_set_epi32(0x0684704c,0xe620c00a,0xb2c5fef0,0x75817b9d);
@@ -365,6 +366,37 @@ void haraka512(unsigned char *out, const unsigned char *in) {
TRUNCSTORE(out, s[0], s[1], s[2], s[3]);
}
void haraka512_zero(unsigned char *out, const unsigned char *in) {
u128 s[4], tmp;
s[0] = LOAD(in);
s[1] = LOAD(in + 16);
s[2] = LOAD(in + 32);
s[3] = LOAD(in + 48);
AES4_zero(s[0], s[1], s[2], s[3], 0);
MIX4(s[0], s[1], s[2], s[3]);
AES4_zero(s[0], s[1], s[2], s[3], 8);
MIX4(s[0], s[1], s[2], s[3]);
AES4_zero(s[0], s[1], s[2], s[3], 16);
MIX4(s[0], s[1], s[2], s[3]);
AES4_zero(s[0], s[1], s[2], s[3], 24);
MIX4(s[0], s[1], s[2], s[3]);
AES4_zero(s[0], s[1], s[2], s[3], 32);
MIX4(s[0], s[1], s[2], s[3]);
s[0] = _mm_xor_si128(s[0], LOAD(in));
s[1] = _mm_xor_si128(s[1], LOAD(in + 16));
s[2] = _mm_xor_si128(s[2], LOAD(in + 32));
s[3] = _mm_xor_si128(s[3], LOAD(in + 48));
TRUNCSTORE(out, s[0], s[1], s[2], s[3]);
}
void haraka512_4x(unsigned char *out, const unsigned char *in) {
u128 s[4][4], tmp;

View File

@@ -68,6 +68,16 @@ extern u128 rc[40];
s2 = _mm_aesenc_si128(s2, rc[rci + 6]); \
s3 = _mm_aesenc_si128(s3, rc[rci + 7]); \
#define AES4_zero(s0, s1, s2, s3, rci) \
s0 = _mm_aesenc_si128(s0, rc0[rci]); \
s1 = _mm_aesenc_si128(s1, rc0[rci + 1]); \
s2 = _mm_aesenc_si128(s2, rc0[rci + 2]); \
s3 = _mm_aesenc_si128(s3, rc0[rci + 3]); \
s0 = _mm_aesenc_si128(s0, rc0[rci + 4]); \
s1 = _mm_aesenc_si128(s1, rc0[rci + 5]); \
s2 = _mm_aesenc_si128(s2, rc0[rci + 6]); \
s3 = _mm_aesenc_si128(s3, rc0[rci + 7]); \
#define AES4_4x(s0, s1, s2, s3, rci) \
AES4(s0[0], s0[1], s0[2], s0[3], rci); \
AES4(s1[0], s1[1], s1[2], s1[3], rci); \
@@ -109,6 +119,7 @@ void haraka256_4x(unsigned char *out, const unsigned char *in);
void haraka256_8x(unsigned char *out, const unsigned char *in);
void haraka512(unsigned char *out, const unsigned char *in);
void haraka512_zero(unsigned char *out, const unsigned char *in);
void haraka512_4x(unsigned char *out, const unsigned char *in);
void haraka512_8x(unsigned char *out, const unsigned char *in);

View File

@@ -53,6 +53,7 @@ static const unsigned char haraka_rc[40][16] = {
};
static unsigned char rc[40][16];
static unsigned char rc0[40][16];
static unsigned char rc_sseed[40][16];
static const unsigned char sbox[256] =
@@ -121,6 +122,12 @@ void unpackhi32(unsigned char *t, unsigned char *a, unsigned char *b)
memcpy(t, tmp, 16);
}
void load_constants_port()
{
/* Use the standard constants to generate tweaked ones. */
memcpy(rc, haraka_rc, 40*16);
}
void tweak_constants(const unsigned char *pk_seed, const unsigned char *sk_seed,
unsigned long long seed_length)
{
@@ -258,6 +265,58 @@ void haraka512_port(unsigned char *out, const unsigned char *in)
memcpy(out + 24, buf + 48, 8);
}
void haraka512_perm_zero(unsigned char *out, const unsigned char *in)
{
int i, j;
unsigned char s[64], tmp[16];
memcpy(s, in, 16);
memcpy(s + 16, in + 16, 16);
memcpy(s + 32, in + 32, 16);
memcpy(s + 48, in + 48, 16);
for (i = 0; i < 5; ++i) {
// aes round(s)
for (j = 0; j < 2; ++j) {
aesenc(s, rc0[4*2*i + 4*j]);
aesenc(s + 16, rc0[4*2*i + 4*j + 1]);
aesenc(s + 32, rc0[4*2*i + 4*j + 2]);
aesenc(s + 48, rc0[4*2*i + 4*j + 3]);
}
// mixing
unpacklo32(tmp, s, s + 16);
unpackhi32(s, s, s + 16);
unpacklo32(s + 16, s + 32, s + 48);
unpackhi32(s + 32, s + 32, s + 48);
unpacklo32(s + 48, s, s + 32);
unpackhi32(s, s, s + 32);
unpackhi32(s + 32, s + 16, tmp);
unpacklo32(s + 16, s + 16, tmp);
}
memcpy(out, s, 64);
}
void haraka512_port_zero(unsigned char *out, const unsigned char *in)
{
int i;
unsigned char buf[64];
haraka512_perm_zero(buf, in);
/* Feed-forward */
for (i = 0; i < 64; i++) {
buf[i] = buf[i] ^ in[i];
}
/* Truncated */
memcpy(out, buf + 8, 8);
memcpy(out + 8, buf + 24, 8);
memcpy(out + 16, buf + 32, 8);
memcpy(out + 24, buf + 48, 8);
}
void haraka256_port(unsigned char *out, const unsigned char *in)
{

View File

@@ -1,6 +1,9 @@
#ifndef SPX_HARAKA_H
#define SPX_HARAKA_H
/* load constants */
void load_constants_port();
/* Tweak constants with seed */
void tweak_constants(const unsigned char *pk_seed, const unsigned char *sk_seed,
unsigned long long seed_length);
@@ -15,6 +18,12 @@ void haraka512_perm(unsigned char *out, const unsigned char *in);
/* Implementation of Haraka-512 */
void haraka512_port(unsigned char *out, const unsigned char *in);
/* Applies the 512-bit Haraka permutation to in, using zero key. */
void haraka512_perm_zero(unsigned char *out, const unsigned char *in);
/* Implementation of Haraka-512, using zero key */
void haraka512_port_zero(unsigned char *out, const unsigned char *in);
/* Implementation of Haraka-256 */
void haraka256_port(unsigned char *out, const unsigned char *in);

View File

@@ -12,6 +12,8 @@ bit output.
#include "crypto/common.h"
#include "crypto/verus_hash.h"
void (*CVerusHash::haraka512Function)(unsigned char *out, const unsigned char *in);
void CVerusHash::Hash(void *result, const void *data, size_t len)
{
unsigned char buf[128];
@@ -36,7 +38,7 @@ void CVerusHash::Hash(void *result, const void *data, size_t len)
memcpy(bufPtr + 32, ptr + pos, i);
memset(bufPtr + 32 + i, 0, 32 - i);
}
haraka512(bufPtr2, bufPtr);
(*haraka512Function)(bufPtr2, bufPtr);
bufPtr2 = bufPtr;
bufPtr += nextOffset;
nextOffset *= -1;
@@ -44,6 +46,18 @@ void CVerusHash::Hash(void *result, const void *data, size_t len)
memcpy(result, bufPtr, 32);
};
void CVerusHash::init()
{
if (IsCPUVerusOptimized())
{
haraka512Function = &haraka512_zero;
}
else
{
haraka512Function = &haraka512_port_zero;
}
}
CVerusHash &CVerusHash::Write(const unsigned char *data, size_t len)
{
unsigned char *tmp;
@@ -56,7 +70,7 @@ CVerusHash &CVerusHash::Write(const unsigned char *data, size_t len)
if (len - pos >= room)
{
memcpy(curBuf + 32 + curPos, data + pos, room);
haraka512(result, curBuf);
(*haraka512Function)(result, curBuf);
tmp = curBuf;
curBuf = result;
result = tmp;
@@ -73,7 +87,31 @@ CVerusHash &CVerusHash::Write(const unsigned char *data, size_t len)
return *this;
}
void CVerusHashPortable::Hash(void *result, const void *data, size_t len)
// to be declared and accessed from C
void verus_hash(void *result, const void *data, size_t len)
{
return CVerusHash::Hash(result, data, len);
}
void (*CVerusHashV2::haraka512Function)(unsigned char *out, const unsigned char *in);
void CVerusHashV2::init()
{
// load and tweak the haraka constants
load_constants();
load_constants_port();
if (IsCPUVerusOptimized())
{
haraka512Function = &haraka512;
}
else
{
haraka512Function = &haraka512_port;
}
}
void CVerusHashV2::Hash(void *result, const void *data, size_t len)
{
unsigned char buf[128];
unsigned char *bufPtr = buf;
@@ -97,7 +135,7 @@ void CVerusHashPortable::Hash(void *result, const void *data, size_t len)
memcpy(bufPtr + 32, ptr + pos, i);
memset(bufPtr + 32 + i, 0, 32 - i);
}
haraka512_port(bufPtr2, bufPtr);
(*haraka512Function)(bufPtr2, bufPtr);
bufPtr2 = bufPtr;
bufPtr += nextOffset;
nextOffset *= -1;
@@ -105,7 +143,7 @@ void CVerusHashPortable::Hash(void *result, const void *data, size_t len)
memcpy(result, bufPtr, 32);
};
CVerusHashPortable &CVerusHashPortable::Write(const unsigned char *data, size_t len)
CVerusHashV2 &CVerusHashV2::Write(const unsigned char *data, size_t len)
{
unsigned char *tmp;
@@ -117,7 +155,7 @@ CVerusHashPortable &CVerusHashPortable::Write(const unsigned char *data, size_t
if (len - pos >= room)
{
memcpy(curBuf + 32 + curPos, data + pos, room);
haraka512_port(result, curBuf);
(*haraka512Function)(result, curBuf);
tmp = curBuf;
curBuf = result;
result = tmp;
@@ -135,14 +173,7 @@ CVerusHashPortable &CVerusHashPortable::Write(const unsigned char *data, size_t
}
// to be declared and accessed from C
void verus_hash(void *result, const void *data, size_t len)
void verus_hash_v2(void *result, const void *data, size_t len)
{
return CVerusHashPortable::Hash(result, data, len);
return CVerusHashV2::Hash(result, data, len);
}
// to be declared and accessed from C
void verus_hash_optimized(void *result, const void *data, size_t len)
{
return CVerusHash::Hash(result, data, len);
}

View File

@@ -23,6 +23,9 @@ class CVerusHash
{
public:
static void Hash(void *result, const void *data, size_t len);
static void (*haraka512Function)(unsigned char *out, const unsigned char *in);
static void init();
CVerusHash() {}
@@ -36,12 +39,22 @@ class CVerusHash
std::fill(buf1, buf1 + sizeof(buf1), 0);
}
int64_t *ExtraI64Ptr() { return (int64_t *)(curBuf + 32); }
void ClearExtra()
{
if (curPos)
{
std::fill(curBuf + 32 + curPos, curBuf + 64, 0);
}
}
void ExtraHash(unsigned char hash[32]) { (*haraka512Function)(hash, curBuf); }
void Finalize(unsigned char hash[32])
{
if (curPos)
{
std::fill(curBuf + 32 + curPos, curBuf + 64, 0);
haraka512(hash, curBuf);
(*haraka512Function)(hash, curBuf);
}
else
std::memcpy(hash, curBuf, 32);
@@ -54,16 +67,19 @@ class CVerusHash
size_t curPos = 0;
};
class CVerusHashPortable
class CVerusHashV2
{
public:
static void Hash(void *result, const void *data, size_t len);
static void (*haraka512Function)(unsigned char *out, const unsigned char *in);
CVerusHashPortable() {}
static void init();
CVerusHashPortable &Write(const unsigned char *data, size_t len);
CVerusHashV2() {}
CVerusHashPortable &Reset()
CVerusHashV2 &Write(const unsigned char *data, size_t len);
CVerusHashV2 &Reset()
{
curBuf = buf1;
result = buf2;
@@ -71,12 +87,22 @@ class CVerusHashPortable
std::fill(buf1, buf1 + sizeof(buf1), 0);
}
int64_t *ExtraI64Ptr() { return (int64_t *)(curBuf + 32); }
void ClearExtra()
{
if (curPos)
{
std::fill(curBuf + 32 + curPos, curBuf + 64, 0);
}
}
void ExtraHash(unsigned char hash[32]) { (*haraka512Function)(hash, curBuf); }
void Finalize(unsigned char hash[32])
{
if (curPos)
{
std::fill(curBuf + 32 + curPos, curBuf + 64, 0);
haraka512_port(hash, curBuf);
(*haraka512Function)(hash, curBuf);
}
else
std::memcpy(hash, curBuf, 32);
@@ -90,6 +116,7 @@ class CVerusHashPortable
};
extern void verus_hash(void *result, const void *data, size_t len);
extern void verus_hash_v2(void *result, const void *data, size_t len);
inline bool IsCPUVerusOptimized()
{