Files
dragonx/src/snark/libsnark/algebra/scalar_multiplication/multiexp.tcc
2018-04-20 03:32:45 +03:00

592 lines
17 KiB
C++

/** @file
*****************************************************************************
Implementation of interfaces for multi-exponentiation routines.
See multiexp.hpp .
*****************************************************************************
* @author This file is part of libsnark, developed by SCIPR Lab
* and contributors (see AUTHORS).
* @copyright MIT license (see LICENSE file)
*****************************************************************************/
#ifndef MULTIEXP_TCC_
#define MULTIEXP_TCC_
#include "algebra/fields/fp_aux.tcc"
#include <algorithm>
#include <cassert>
#include <type_traits>
#include "common/profiling.hpp"
#include "common/utils.hpp"
#include "common/assert_except.hpp"
#include "algebra/scalar_multiplication/wnaf.hpp"
namespace libsnark {
template<mp_size_t n>
class ordered_exponent {
// to use std::push_heap and friends later
public:
size_t idx;
bigint<n> r;
ordered_exponent(const size_t idx, const bigint<n> &r) : idx(idx), r(r) {};
bool operator<(const ordered_exponent<n> &other) const
{
#if defined(__x86_64__) && defined(USE_ASM)
if (n == 3)
{
int64_t res;
__asm__
("// check for overflow \n\t"
"mov $0, %[res] \n\t"
ADD_CMP(16)
ADD_CMP(8)
ADD_CMP(0)
"jmp done%= \n\t"
"subtract%=: \n\t"
"mov $1, %[res] \n\t"
"done%=: \n\t"
: [res] "=&r" (res)
: [A] "r" (other.r.data), [mod] "r" (this->r.data)
: "cc", "%rax");
return res;
}
else if (n == 4)
{
int64_t res;
__asm__
("// check for overflow \n\t"
"mov $0, %[res] \n\t"
ADD_CMP(24)
ADD_CMP(16)
ADD_CMP(8)
ADD_CMP(0)
"jmp done%= \n\t"
"subtract%=: \n\t"
"mov $1, %[res] \n\t"
"done%=: \n\t"
: [res] "=&r" (res)
: [A] "r" (other.r.data), [mod] "r" (this->r.data)
: "cc", "%rax");
return res;
}
else if (n == 5)
{
int64_t res;
__asm__
("// check for overflow \n\t"
"mov $0, %[res] \n\t"
ADD_CMP(32)
ADD_CMP(24)
ADD_CMP(16)
ADD_CMP(8)
ADD_CMP(0)
"jmp done%= \n\t"
"subtract%=: \n\t"
"mov $1, %[res] \n\t"
"done%=: \n\t"
: [res] "=&r" (res)
: [A] "r" (other.r.data), [mod] "r" (this->r.data)
: "cc", "%rax");
return res;
}
else
#endif
{
return (mpn_cmp(this->r.data, other.r.data, n) < 0);
}
}
};
template<typename T, typename FieldT>
T naive_exp(typename std::vector<T>::const_iterator vec_start,
typename std::vector<T>::const_iterator vec_end,
typename std::vector<FieldT>::const_iterator scalar_start,
typename std::vector<FieldT>::const_iterator scalar_end)
{
T result(T::zero());
typename std::vector<T>::const_iterator vec_it;
typename std::vector<FieldT>::const_iterator scalar_it;
for (vec_it = vec_start, scalar_it = scalar_start; vec_it != vec_end; ++vec_it, ++scalar_it)
{
bigint<FieldT::num_limbs> scalar_bigint = scalar_it->as_bigint();
result = result + opt_window_wnaf_exp(*vec_it, scalar_bigint, scalar_bigint.num_bits());
}
assert_except(scalar_it == scalar_end);
return result;
}
template<typename T, typename FieldT>
T naive_plain_exp(typename std::vector<T>::const_iterator vec_start,
typename std::vector<T>::const_iterator vec_end,
typename std::vector<FieldT>::const_iterator scalar_start,
typename std::vector<FieldT>::const_iterator scalar_end)
{
T result(T::zero());
typename std::vector<T>::const_iterator vec_it;
typename std::vector<FieldT>::const_iterator scalar_it;
for (vec_it = vec_start, scalar_it = scalar_start; vec_it != vec_end; ++vec_it, ++scalar_it)
{
result = result + (*scalar_it) * (*vec_it);
}
assert_except(scalar_it == scalar_end);
return result;
}
/*
The multi-exponentiation algorithm below is a variant of the Bos-Coster algorithm
[Bos and Coster, "Addition chain heuristics", CRYPTO '89].
The implementation uses suggestions from
[Bernstein, Duif, Lange, Schwabe, and Yang, "High-speed high-security signatures", CHES '11].
*/
template<typename T, typename FieldT>
T multi_exp_inner(typename std::vector<T>::const_iterator vec_start,
typename std::vector<T>::const_iterator vec_end,
typename std::vector<FieldT>::const_iterator scalar_start,
typename std::vector<FieldT>::const_iterator scalar_end)
{
const mp_size_t n = std::remove_reference<decltype(*scalar_start)>::type::num_limbs;
if (vec_start == vec_end)
{
return T::zero();
}
if (vec_start + 1 == vec_end)
{
return (*scalar_start)*(*vec_start);
}
std::vector<ordered_exponent<n> > opt_q;
const size_t vec_len = scalar_end - scalar_start;
const size_t odd_vec_len = (vec_len % 2 == 1 ? vec_len : vec_len + 1);
opt_q.reserve(odd_vec_len);
std::vector<T> g;
g.reserve(odd_vec_len);
typename std::vector<T>::const_iterator vec_it;
typename std::vector<FieldT>::const_iterator scalar_it;
size_t i;
for (i=0, vec_it = vec_start, scalar_it = scalar_start; vec_it != vec_end; ++vec_it, ++scalar_it, ++i)
{
g.emplace_back(*vec_it);
opt_q.emplace_back(ordered_exponent<n>(i, scalar_it->as_bigint()));
}
std::make_heap(opt_q.begin(),opt_q.end());
assert_except(scalar_it == scalar_end);
if (vec_len != odd_vec_len)
{
g.emplace_back(T::zero());
opt_q.emplace_back(ordered_exponent<n>(odd_vec_len - 1, bigint<n>(UINT64_C(0))));
}
assert_except(g.size() % 2 == 1);
assert_except(opt_q.size() == g.size());
T opt_result = T::zero();
while (true)
{
ordered_exponent<n> &a = opt_q[0];
ordered_exponent<n> &b = (opt_q[1] < opt_q[2] ? opt_q[2] : opt_q[1]);
const size_t abits = a.r.num_bits();
if (b.r.is_zero())
{
// opt_result = opt_result + (a.r * g[a.idx]);
opt_result = opt_result + opt_window_wnaf_exp(g[a.idx], a.r, abits);
break;
}
const size_t bbits = b.r.num_bits();
const size_t limit = (abits-bbits >= 20 ? 20 : abits-bbits);
if (bbits < UINT64_C(1)<<limit)
{
/*
In this case, exponentiating to the power of a is cheaper than
subtracting b from a multiple times, so let's do it directly
*/
// opt_result = opt_result + (a.r * g[a.idx]);
opt_result = opt_result + opt_window_wnaf_exp(g[a.idx], a.r, abits);
#ifdef DEBUG
printf("Skipping the following pair (%zu bit number vs %zu bit):\n", abits, bbits);
a.r.print();
b.r.print();
#endif
a.r.clear();
}
else
{
// x A + y B => (x-y) A + y (B+A)
mpn_sub_n(a.r.data, a.r.data, b.r.data, n);
g[b.idx] = g[b.idx] + g[a.idx];
}
// regardless of whether a was cleared or subtracted from we push it down, then take back up
/* heapify A down */
size_t a_pos = 0;
while (2*a_pos + 2< odd_vec_len)
{
// this is a max-heap so to maintain a heap property we swap with the largest of the two
if (opt_q[2*a_pos+1] < opt_q[2*a_pos+2])
{
std::swap(opt_q[a_pos], opt_q[2*a_pos+2]);
a_pos = 2*a_pos+2;
}
else
{
std::swap(opt_q[a_pos], opt_q[2*a_pos+1]);
a_pos = 2*a_pos+1;
}
}
/* now heapify A up appropriate amount of times */
while (a_pos > 0 && opt_q[(a_pos-1)/2] < opt_q[a_pos])
{
std::swap(opt_q[a_pos], opt_q[(a_pos-1)/2]);
a_pos = (a_pos-1) / 2;
}
}
return opt_result;
}
template<typename T, typename FieldT>
T multi_exp(typename std::vector<T>::const_iterator vec_start,
typename std::vector<T>::const_iterator vec_end,
typename std::vector<FieldT>::const_iterator scalar_start,
typename std::vector<FieldT>::const_iterator scalar_end,
const size_t chunks,
const bool use_multiexp)
{
const size_t total = vec_end - vec_start;
if (total < chunks)
{
return naive_exp<T, FieldT>(vec_start, vec_end, scalar_start, scalar_end);
}
const size_t one = total/chunks;
std::vector<T> partial(chunks, T::zero());
if (use_multiexp)
{
#ifdef MULTICORE
#pragma omp parallel for
#endif
for (size_t i = 0; i < chunks; ++i)
{
partial[i] = multi_exp_inner<T, FieldT>(vec_start + i*one,
(i == chunks-1 ? vec_end : vec_start + (i+1)*one),
scalar_start + i*one,
(i == chunks-1 ? scalar_end : scalar_start + (i+1)*one));
}
}
else
{
#ifdef MULTICORE
#pragma omp parallel for
#endif
for (size_t i = 0; i < chunks; ++i)
{
partial[i] = naive_exp<T, FieldT>(vec_start + i*one,
(i == chunks-1 ? vec_end : vec_start + (i+1)*one),
scalar_start + i*one,
(i == chunks-1 ? scalar_end : scalar_start + (i+1)*one));
}
}
T final = T::zero();
for (size_t i = 0; i < chunks; ++i)
{
final = final + partial[i];
}
return final;
}
template<typename T, typename FieldT>
T multi_exp_with_mixed_addition(typename std::vector<T>::const_iterator vec_start,
typename std::vector<T>::const_iterator vec_end,
typename std::vector<FieldT>::const_iterator scalar_start,
typename std::vector<FieldT>::const_iterator scalar_end,
const size_t chunks,
const bool use_multiexp)
{
assert_except(std::distance(vec_start, vec_end) == std::distance(scalar_start, scalar_end));
enter_block("Process scalar vector");
auto value_it = vec_start;
auto scalar_it = scalar_start;
const FieldT zero = FieldT::zero();
const FieldT one = FieldT::one();
std::vector<FieldT> p;
std::vector<T> g;
T acc = T::zero();
size_t num_skip = 0;
size_t num_add = 0;
size_t num_other = 0;
for (; scalar_it != scalar_end; ++scalar_it, ++value_it)
{
if (*scalar_it == zero)
{
// do nothing
++num_skip;
}
else if (*scalar_it == one)
{
#ifdef USE_MIXED_ADDITION
acc = acc.mixed_add(*value_it);
#else
acc = acc + (*value_it);
#endif
++num_add;
}
else
{
p.emplace_back(*scalar_it);
g.emplace_back(*value_it);
++num_other;
}
}
//print_indent(); printf("* Elements of w skipped: %zu (%0.2f%%)\n", num_skip, 100.*num_skip/(num_skip+num_add+num_other));
//print_indent(); printf("* Elements of w processed with special addition: %zu (%0.2f%%)\n", num_add, 100.*num_add/(num_skip+num_add+num_other));
//print_indent(); printf("* Elements of w remaining: %zu (%0.2f%%)\n", num_other, 100.*num_other/(num_skip+num_add+num_other));
leave_block("Process scalar vector");
return acc + multi_exp<T, FieldT>(g.begin(), g.end(), p.begin(), p.end(), chunks, use_multiexp);
}
template<typename T>
size_t get_exp_window_size(const size_t num_scalars)
{
if (T::fixed_base_exp_window_table.empty())
{
#ifdef LOWMEM
return 14;
#else
return 17;
#endif
}
size_t window = 1;
for (int64_t i = T::fixed_base_exp_window_table.size()-1; i >= 0; --i)
{
#ifdef DEBUG
if (!inhibit_profiling_info)
{
printf("%ld %zu %zu\n", i, num_scalars, T::fixed_base_exp_window_table[i]);
}
#endif
if (T::fixed_base_exp_window_table[i] != 0 && num_scalars >= T::fixed_base_exp_window_table[i])
{
window = i+1;
break;
}
}
if (!inhibit_profiling_info)
{
print_indent(); printf("Choosing window size %zu for %zu elements\n", window, num_scalars);
}
#ifdef LOWMEM
window = std::min((size_t)14, window);
#endif
return window;
}
template<typename T>
window_table<T> get_window_table(const size_t scalar_size,
const size_t window,
const T &g)
{
const size_t in_window = UINT64_C(1)<<window;
const size_t outerc = (scalar_size+window-1)/window;
const size_t last_in_window = UINT64_C(1)<<(scalar_size - (outerc-1)*window);
#ifdef DEBUG
if (!inhibit_profiling_info)
{
print_indent(); printf("* scalar_size=%zu; window=%zu; in_window=%zu; outerc=%zu\n", scalar_size, window, in_window, outerc);
}
#endif
window_table<T> powers_of_g(outerc, std::vector<T>(in_window, T::zero()));
T gouter = g;
for (size_t outer = 0; outer < outerc; ++outer)
{
T ginner = T::zero();
size_t cur_in_window = outer == outerc-1 ? last_in_window : in_window;
for (size_t inner = 0; inner < cur_in_window; ++inner)
{
powers_of_g[outer][inner] = ginner;
ginner = ginner + gouter;
}
for (size_t i = 0; i < window; ++i)
{
gouter = gouter + gouter;
}
}
return powers_of_g;
}
template<typename T, typename FieldT>
T windowed_exp(const size_t scalar_size,
const size_t window,
const window_table<T> &powers_of_g,
const FieldT &pow)
{
const size_t outerc = (scalar_size+window-1)/window;
const bigint<FieldT::num_limbs> pow_val = pow.as_bigint();
/* exp */
T res = powers_of_g[0][0];
for (size_t outer = 0; outer < outerc; ++outer)
{
size_t inner = 0;
for (size_t i = 0; i < window; ++i)
{
if (pow_val.test_bit(outer*window + i))
{
inner |= 1u << i;
}
}
res = res + powers_of_g[outer][inner];
}
return res;
}
template<typename T, typename FieldT>
std::vector<T> batch_exp(const size_t scalar_size,
const size_t window,
const window_table<T> &table,
const std::vector<FieldT> &v)
{
if (!inhibit_profiling_info)
{
print_indent();
}
std::vector<T> res(v.size(), table[0][0]);
#ifdef MULTICORE
#pragma omp parallel for
#endif
for (size_t i = 0; i < v.size(); ++i)
{
res[i] = windowed_exp(scalar_size, window, table, v[i]);
if (!inhibit_profiling_info && (i % 10000 == 0))
{
printf(".");
fflush(stdout);
}
}
if (!inhibit_profiling_info)
{
printf(" DONE!\n");
}
return res;
}
template<typename T, typename FieldT>
std::vector<T> batch_exp_with_coeff(const size_t scalar_size,
const size_t window,
const window_table<T> &table,
const FieldT &coeff,
const std::vector<FieldT> &v)
{
if (!inhibit_profiling_info)
{
print_indent();
}
std::vector<T> res(v.size(), table[0][0]);
#ifdef MULTICORE
#pragma omp parallel for
#endif
for (size_t i = 0; i < v.size(); ++i)
{
res[i] = windowed_exp(scalar_size, window, table, coeff * v[i]);
if (!inhibit_profiling_info && (i % 10000 == 0))
{
printf(".");
fflush(stdout);
}
}
if (!inhibit_profiling_info)
{
printf(" DONE!\n");
}
return res;
}
template<typename T>
void batch_to_special(std::vector<T> &vec)
{
enter_block("Batch-convert elements to special form");
std::vector<T> non_zero_vec;
for (size_t i = 0; i < vec.size(); ++i)
{
if (!vec[i].is_zero())
{
non_zero_vec.emplace_back(vec[i]);
}
}
batch_to_special_all_non_zeros<T>(non_zero_vec);
auto it = non_zero_vec.begin();
T zero_special = T::zero();
zero_special.to_special();
for (size_t i = 0; i < vec.size(); ++i)
{
if (!vec[i].is_zero())
{
vec[i] = *it;
++it;
}
else
{
vec[i] = zero_special;
}
}
leave_block("Batch-convert elements to special form");
}
} // libsnark
#endif // MULTIEXP_TCC_