592 lines
17 KiB
C++
592 lines
17 KiB
C++
/** @file
|
|
*****************************************************************************
|
|
|
|
Implementation of interfaces for multi-exponentiation routines.
|
|
|
|
See multiexp.hpp .
|
|
|
|
*****************************************************************************
|
|
* @author This file is part of libsnark, developed by SCIPR Lab
|
|
* and contributors (see AUTHORS).
|
|
* @copyright MIT license (see LICENSE file)
|
|
*****************************************************************************/
|
|
|
|
#ifndef MULTIEXP_TCC_
|
|
#define MULTIEXP_TCC_
|
|
|
|
#include "algebra/fields/fp_aux.tcc"
|
|
|
|
#include <algorithm>
|
|
#include <cassert>
|
|
#include <type_traits>
|
|
|
|
#include "common/profiling.hpp"
|
|
#include "common/utils.hpp"
|
|
#include "common/assert_except.hpp"
|
|
#include "algebra/scalar_multiplication/wnaf.hpp"
|
|
|
|
namespace libsnark {
|
|
|
|
template<mp_size_t n>
|
|
class ordered_exponent {
|
|
// to use std::push_heap and friends later
|
|
public:
|
|
size_t idx;
|
|
bigint<n> r;
|
|
|
|
ordered_exponent(const size_t idx, const bigint<n> &r) : idx(idx), r(r) {};
|
|
|
|
bool operator<(const ordered_exponent<n> &other) const
|
|
{
|
|
#if defined(__x86_64__) && defined(USE_ASM)
|
|
if (n == 3)
|
|
{
|
|
int64_t res;
|
|
__asm__
|
|
("// check for overflow \n\t"
|
|
"mov $0, %[res] \n\t"
|
|
ADD_CMP(16)
|
|
ADD_CMP(8)
|
|
ADD_CMP(0)
|
|
"jmp done%= \n\t"
|
|
"subtract%=: \n\t"
|
|
"mov $1, %[res] \n\t"
|
|
"done%=: \n\t"
|
|
: [res] "=&r" (res)
|
|
: [A] "r" (other.r.data), [mod] "r" (this->r.data)
|
|
: "cc", "%rax");
|
|
return res;
|
|
}
|
|
else if (n == 4)
|
|
{
|
|
int64_t res;
|
|
__asm__
|
|
("// check for overflow \n\t"
|
|
"mov $0, %[res] \n\t"
|
|
ADD_CMP(24)
|
|
ADD_CMP(16)
|
|
ADD_CMP(8)
|
|
ADD_CMP(0)
|
|
"jmp done%= \n\t"
|
|
"subtract%=: \n\t"
|
|
"mov $1, %[res] \n\t"
|
|
"done%=: \n\t"
|
|
: [res] "=&r" (res)
|
|
: [A] "r" (other.r.data), [mod] "r" (this->r.data)
|
|
: "cc", "%rax");
|
|
return res;
|
|
}
|
|
else if (n == 5)
|
|
{
|
|
int64_t res;
|
|
__asm__
|
|
("// check for overflow \n\t"
|
|
"mov $0, %[res] \n\t"
|
|
ADD_CMP(32)
|
|
ADD_CMP(24)
|
|
ADD_CMP(16)
|
|
ADD_CMP(8)
|
|
ADD_CMP(0)
|
|
"jmp done%= \n\t"
|
|
"subtract%=: \n\t"
|
|
"mov $1, %[res] \n\t"
|
|
"done%=: \n\t"
|
|
: [res] "=&r" (res)
|
|
: [A] "r" (other.r.data), [mod] "r" (this->r.data)
|
|
: "cc", "%rax");
|
|
return res;
|
|
}
|
|
else
|
|
#endif
|
|
{
|
|
return (mpn_cmp(this->r.data, other.r.data, n) < 0);
|
|
}
|
|
}
|
|
};
|
|
|
|
template<typename T, typename FieldT>
|
|
T naive_exp(typename std::vector<T>::const_iterator vec_start,
|
|
typename std::vector<T>::const_iterator vec_end,
|
|
typename std::vector<FieldT>::const_iterator scalar_start,
|
|
typename std::vector<FieldT>::const_iterator scalar_end)
|
|
{
|
|
T result(T::zero());
|
|
|
|
typename std::vector<T>::const_iterator vec_it;
|
|
typename std::vector<FieldT>::const_iterator scalar_it;
|
|
|
|
for (vec_it = vec_start, scalar_it = scalar_start; vec_it != vec_end; ++vec_it, ++scalar_it)
|
|
{
|
|
bigint<FieldT::num_limbs> scalar_bigint = scalar_it->as_bigint();
|
|
result = result + opt_window_wnaf_exp(*vec_it, scalar_bigint, scalar_bigint.num_bits());
|
|
}
|
|
assert_except(scalar_it == scalar_end);
|
|
|
|
return result;
|
|
}
|
|
|
|
template<typename T, typename FieldT>
|
|
T naive_plain_exp(typename std::vector<T>::const_iterator vec_start,
|
|
typename std::vector<T>::const_iterator vec_end,
|
|
typename std::vector<FieldT>::const_iterator scalar_start,
|
|
typename std::vector<FieldT>::const_iterator scalar_end)
|
|
{
|
|
T result(T::zero());
|
|
|
|
typename std::vector<T>::const_iterator vec_it;
|
|
typename std::vector<FieldT>::const_iterator scalar_it;
|
|
|
|
for (vec_it = vec_start, scalar_it = scalar_start; vec_it != vec_end; ++vec_it, ++scalar_it)
|
|
{
|
|
result = result + (*scalar_it) * (*vec_it);
|
|
}
|
|
assert_except(scalar_it == scalar_end);
|
|
|
|
return result;
|
|
}
|
|
|
|
/*
|
|
The multi-exponentiation algorithm below is a variant of the Bos-Coster algorithm
|
|
[Bos and Coster, "Addition chain heuristics", CRYPTO '89].
|
|
The implementation uses suggestions from
|
|
[Bernstein, Duif, Lange, Schwabe, and Yang, "High-speed high-security signatures", CHES '11].
|
|
*/
|
|
template<typename T, typename FieldT>
|
|
T multi_exp_inner(typename std::vector<T>::const_iterator vec_start,
|
|
typename std::vector<T>::const_iterator vec_end,
|
|
typename std::vector<FieldT>::const_iterator scalar_start,
|
|
typename std::vector<FieldT>::const_iterator scalar_end)
|
|
{
|
|
const mp_size_t n = std::remove_reference<decltype(*scalar_start)>::type::num_limbs;
|
|
|
|
if (vec_start == vec_end)
|
|
{
|
|
return T::zero();
|
|
}
|
|
|
|
if (vec_start + 1 == vec_end)
|
|
{
|
|
return (*scalar_start)*(*vec_start);
|
|
}
|
|
|
|
std::vector<ordered_exponent<n> > opt_q;
|
|
const size_t vec_len = scalar_end - scalar_start;
|
|
const size_t odd_vec_len = (vec_len % 2 == 1 ? vec_len : vec_len + 1);
|
|
opt_q.reserve(odd_vec_len);
|
|
std::vector<T> g;
|
|
g.reserve(odd_vec_len);
|
|
|
|
typename std::vector<T>::const_iterator vec_it;
|
|
typename std::vector<FieldT>::const_iterator scalar_it;
|
|
size_t i;
|
|
for (i=0, vec_it = vec_start, scalar_it = scalar_start; vec_it != vec_end; ++vec_it, ++scalar_it, ++i)
|
|
{
|
|
g.emplace_back(*vec_it);
|
|
|
|
opt_q.emplace_back(ordered_exponent<n>(i, scalar_it->as_bigint()));
|
|
}
|
|
std::make_heap(opt_q.begin(),opt_q.end());
|
|
assert_except(scalar_it == scalar_end);
|
|
|
|
if (vec_len != odd_vec_len)
|
|
{
|
|
g.emplace_back(T::zero());
|
|
opt_q.emplace_back(ordered_exponent<n>(odd_vec_len - 1, bigint<n>(UINT64_C(0))));
|
|
}
|
|
assert_except(g.size() % 2 == 1);
|
|
assert_except(opt_q.size() == g.size());
|
|
|
|
T opt_result = T::zero();
|
|
|
|
while (true)
|
|
{
|
|
ordered_exponent<n> &a = opt_q[0];
|
|
ordered_exponent<n> &b = (opt_q[1] < opt_q[2] ? opt_q[2] : opt_q[1]);
|
|
|
|
const size_t abits = a.r.num_bits();
|
|
|
|
if (b.r.is_zero())
|
|
{
|
|
// opt_result = opt_result + (a.r * g[a.idx]);
|
|
opt_result = opt_result + opt_window_wnaf_exp(g[a.idx], a.r, abits);
|
|
break;
|
|
}
|
|
|
|
const size_t bbits = b.r.num_bits();
|
|
const size_t limit = (abits-bbits >= 20 ? 20 : abits-bbits);
|
|
|
|
if (bbits < UINT64_C(1)<<limit)
|
|
{
|
|
/*
|
|
In this case, exponentiating to the power of a is cheaper than
|
|
subtracting b from a multiple times, so let's do it directly
|
|
*/
|
|
// opt_result = opt_result + (a.r * g[a.idx]);
|
|
opt_result = opt_result + opt_window_wnaf_exp(g[a.idx], a.r, abits);
|
|
#ifdef DEBUG
|
|
printf("Skipping the following pair (%zu bit number vs %zu bit):\n", abits, bbits);
|
|
a.r.print();
|
|
b.r.print();
|
|
#endif
|
|
a.r.clear();
|
|
}
|
|
else
|
|
{
|
|
// x A + y B => (x-y) A + y (B+A)
|
|
mpn_sub_n(a.r.data, a.r.data, b.r.data, n);
|
|
g[b.idx] = g[b.idx] + g[a.idx];
|
|
}
|
|
|
|
// regardless of whether a was cleared or subtracted from we push it down, then take back up
|
|
|
|
/* heapify A down */
|
|
size_t a_pos = 0;
|
|
while (2*a_pos + 2< odd_vec_len)
|
|
{
|
|
// this is a max-heap so to maintain a heap property we swap with the largest of the two
|
|
if (opt_q[2*a_pos+1] < opt_q[2*a_pos+2])
|
|
{
|
|
std::swap(opt_q[a_pos], opt_q[2*a_pos+2]);
|
|
a_pos = 2*a_pos+2;
|
|
}
|
|
else
|
|
{
|
|
std::swap(opt_q[a_pos], opt_q[2*a_pos+1]);
|
|
a_pos = 2*a_pos+1;
|
|
}
|
|
}
|
|
|
|
/* now heapify A up appropriate amount of times */
|
|
while (a_pos > 0 && opt_q[(a_pos-1)/2] < opt_q[a_pos])
|
|
{
|
|
std::swap(opt_q[a_pos], opt_q[(a_pos-1)/2]);
|
|
a_pos = (a_pos-1) / 2;
|
|
}
|
|
}
|
|
|
|
return opt_result;
|
|
}
|
|
|
|
template<typename T, typename FieldT>
|
|
T multi_exp(typename std::vector<T>::const_iterator vec_start,
|
|
typename std::vector<T>::const_iterator vec_end,
|
|
typename std::vector<FieldT>::const_iterator scalar_start,
|
|
typename std::vector<FieldT>::const_iterator scalar_end,
|
|
const size_t chunks,
|
|
const bool use_multiexp)
|
|
{
|
|
const size_t total = vec_end - vec_start;
|
|
if (total < chunks)
|
|
{
|
|
return naive_exp<T, FieldT>(vec_start, vec_end, scalar_start, scalar_end);
|
|
}
|
|
|
|
const size_t one = total/chunks;
|
|
|
|
std::vector<T> partial(chunks, T::zero());
|
|
|
|
if (use_multiexp)
|
|
{
|
|
#ifdef MULTICORE
|
|
#pragma omp parallel for
|
|
#endif
|
|
for (size_t i = 0; i < chunks; ++i)
|
|
{
|
|
partial[i] = multi_exp_inner<T, FieldT>(vec_start + i*one,
|
|
(i == chunks-1 ? vec_end : vec_start + (i+1)*one),
|
|
scalar_start + i*one,
|
|
(i == chunks-1 ? scalar_end : scalar_start + (i+1)*one));
|
|
}
|
|
}
|
|
else
|
|
{
|
|
#ifdef MULTICORE
|
|
#pragma omp parallel for
|
|
#endif
|
|
for (size_t i = 0; i < chunks; ++i)
|
|
{
|
|
partial[i] = naive_exp<T, FieldT>(vec_start + i*one,
|
|
(i == chunks-1 ? vec_end : vec_start + (i+1)*one),
|
|
scalar_start + i*one,
|
|
(i == chunks-1 ? scalar_end : scalar_start + (i+1)*one));
|
|
}
|
|
}
|
|
|
|
T final = T::zero();
|
|
|
|
for (size_t i = 0; i < chunks; ++i)
|
|
{
|
|
final = final + partial[i];
|
|
}
|
|
|
|
return final;
|
|
}
|
|
|
|
template<typename T, typename FieldT>
|
|
T multi_exp_with_mixed_addition(typename std::vector<T>::const_iterator vec_start,
|
|
typename std::vector<T>::const_iterator vec_end,
|
|
typename std::vector<FieldT>::const_iterator scalar_start,
|
|
typename std::vector<FieldT>::const_iterator scalar_end,
|
|
const size_t chunks,
|
|
const bool use_multiexp)
|
|
{
|
|
assert_except(std::distance(vec_start, vec_end) == std::distance(scalar_start, scalar_end));
|
|
enter_block("Process scalar vector");
|
|
auto value_it = vec_start;
|
|
auto scalar_it = scalar_start;
|
|
|
|
const FieldT zero = FieldT::zero();
|
|
const FieldT one = FieldT::one();
|
|
std::vector<FieldT> p;
|
|
std::vector<T> g;
|
|
|
|
T acc = T::zero();
|
|
|
|
size_t num_skip = 0;
|
|
size_t num_add = 0;
|
|
size_t num_other = 0;
|
|
|
|
for (; scalar_it != scalar_end; ++scalar_it, ++value_it)
|
|
{
|
|
if (*scalar_it == zero)
|
|
{
|
|
// do nothing
|
|
++num_skip;
|
|
}
|
|
else if (*scalar_it == one)
|
|
{
|
|
#ifdef USE_MIXED_ADDITION
|
|
acc = acc.mixed_add(*value_it);
|
|
#else
|
|
acc = acc + (*value_it);
|
|
#endif
|
|
++num_add;
|
|
}
|
|
else
|
|
{
|
|
p.emplace_back(*scalar_it);
|
|
g.emplace_back(*value_it);
|
|
++num_other;
|
|
}
|
|
}
|
|
//print_indent(); printf("* Elements of w skipped: %zu (%0.2f%%)\n", num_skip, 100.*num_skip/(num_skip+num_add+num_other));
|
|
//print_indent(); printf("* Elements of w processed with special addition: %zu (%0.2f%%)\n", num_add, 100.*num_add/(num_skip+num_add+num_other));
|
|
//print_indent(); printf("* Elements of w remaining: %zu (%0.2f%%)\n", num_other, 100.*num_other/(num_skip+num_add+num_other));
|
|
|
|
leave_block("Process scalar vector");
|
|
|
|
return acc + multi_exp<T, FieldT>(g.begin(), g.end(), p.begin(), p.end(), chunks, use_multiexp);
|
|
}
|
|
|
|
template<typename T>
|
|
size_t get_exp_window_size(const size_t num_scalars)
|
|
{
|
|
if (T::fixed_base_exp_window_table.empty())
|
|
{
|
|
#ifdef LOWMEM
|
|
return 14;
|
|
#else
|
|
return 17;
|
|
#endif
|
|
}
|
|
size_t window = 1;
|
|
for (int64_t i = T::fixed_base_exp_window_table.size()-1; i >= 0; --i)
|
|
{
|
|
#ifdef DEBUG
|
|
if (!inhibit_profiling_info)
|
|
{
|
|
printf("%ld %zu %zu\n", i, num_scalars, T::fixed_base_exp_window_table[i]);
|
|
}
|
|
#endif
|
|
if (T::fixed_base_exp_window_table[i] != 0 && num_scalars >= T::fixed_base_exp_window_table[i])
|
|
{
|
|
window = i+1;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (!inhibit_profiling_info)
|
|
{
|
|
print_indent(); printf("Choosing window size %zu for %zu elements\n", window, num_scalars);
|
|
}
|
|
|
|
#ifdef LOWMEM
|
|
window = std::min((size_t)14, window);
|
|
#endif
|
|
return window;
|
|
}
|
|
|
|
template<typename T>
|
|
window_table<T> get_window_table(const size_t scalar_size,
|
|
const size_t window,
|
|
const T &g)
|
|
{
|
|
const size_t in_window = UINT64_C(1)<<window;
|
|
const size_t outerc = (scalar_size+window-1)/window;
|
|
const size_t last_in_window = UINT64_C(1)<<(scalar_size - (outerc-1)*window);
|
|
#ifdef DEBUG
|
|
if (!inhibit_profiling_info)
|
|
{
|
|
print_indent(); printf("* scalar_size=%zu; window=%zu; in_window=%zu; outerc=%zu\n", scalar_size, window, in_window, outerc);
|
|
}
|
|
#endif
|
|
|
|
window_table<T> powers_of_g(outerc, std::vector<T>(in_window, T::zero()));
|
|
|
|
T gouter = g;
|
|
|
|
for (size_t outer = 0; outer < outerc; ++outer)
|
|
{
|
|
T ginner = T::zero();
|
|
size_t cur_in_window = outer == outerc-1 ? last_in_window : in_window;
|
|
for (size_t inner = 0; inner < cur_in_window; ++inner)
|
|
{
|
|
powers_of_g[outer][inner] = ginner;
|
|
ginner = ginner + gouter;
|
|
}
|
|
|
|
for (size_t i = 0; i < window; ++i)
|
|
{
|
|
gouter = gouter + gouter;
|
|
}
|
|
}
|
|
|
|
return powers_of_g;
|
|
}
|
|
|
|
template<typename T, typename FieldT>
|
|
T windowed_exp(const size_t scalar_size,
|
|
const size_t window,
|
|
const window_table<T> &powers_of_g,
|
|
const FieldT &pow)
|
|
{
|
|
const size_t outerc = (scalar_size+window-1)/window;
|
|
const bigint<FieldT::num_limbs> pow_val = pow.as_bigint();
|
|
|
|
/* exp */
|
|
T res = powers_of_g[0][0];
|
|
|
|
for (size_t outer = 0; outer < outerc; ++outer)
|
|
{
|
|
size_t inner = 0;
|
|
for (size_t i = 0; i < window; ++i)
|
|
{
|
|
if (pow_val.test_bit(outer*window + i))
|
|
{
|
|
inner |= 1u << i;
|
|
}
|
|
}
|
|
|
|
res = res + powers_of_g[outer][inner];
|
|
}
|
|
|
|
return res;
|
|
}
|
|
|
|
template<typename T, typename FieldT>
|
|
std::vector<T> batch_exp(const size_t scalar_size,
|
|
const size_t window,
|
|
const window_table<T> &table,
|
|
const std::vector<FieldT> &v)
|
|
{
|
|
if (!inhibit_profiling_info)
|
|
{
|
|
print_indent();
|
|
}
|
|
std::vector<T> res(v.size(), table[0][0]);
|
|
|
|
#ifdef MULTICORE
|
|
#pragma omp parallel for
|
|
#endif
|
|
for (size_t i = 0; i < v.size(); ++i)
|
|
{
|
|
res[i] = windowed_exp(scalar_size, window, table, v[i]);
|
|
|
|
if (!inhibit_profiling_info && (i % 10000 == 0))
|
|
{
|
|
printf(".");
|
|
fflush(stdout);
|
|
}
|
|
}
|
|
|
|
if (!inhibit_profiling_info)
|
|
{
|
|
printf(" DONE!\n");
|
|
}
|
|
|
|
return res;
|
|
}
|
|
|
|
template<typename T, typename FieldT>
|
|
std::vector<T> batch_exp_with_coeff(const size_t scalar_size,
|
|
const size_t window,
|
|
const window_table<T> &table,
|
|
const FieldT &coeff,
|
|
const std::vector<FieldT> &v)
|
|
{
|
|
if (!inhibit_profiling_info)
|
|
{
|
|
print_indent();
|
|
}
|
|
std::vector<T> res(v.size(), table[0][0]);
|
|
|
|
#ifdef MULTICORE
|
|
#pragma omp parallel for
|
|
#endif
|
|
for (size_t i = 0; i < v.size(); ++i)
|
|
{
|
|
res[i] = windowed_exp(scalar_size, window, table, coeff * v[i]);
|
|
|
|
if (!inhibit_profiling_info && (i % 10000 == 0))
|
|
{
|
|
printf(".");
|
|
fflush(stdout);
|
|
}
|
|
}
|
|
|
|
if (!inhibit_profiling_info)
|
|
{
|
|
printf(" DONE!\n");
|
|
}
|
|
|
|
return res;
|
|
}
|
|
|
|
template<typename T>
|
|
void batch_to_special(std::vector<T> &vec)
|
|
{
|
|
enter_block("Batch-convert elements to special form");
|
|
|
|
std::vector<T> non_zero_vec;
|
|
for (size_t i = 0; i < vec.size(); ++i)
|
|
{
|
|
if (!vec[i].is_zero())
|
|
{
|
|
non_zero_vec.emplace_back(vec[i]);
|
|
}
|
|
}
|
|
|
|
batch_to_special_all_non_zeros<T>(non_zero_vec);
|
|
auto it = non_zero_vec.begin();
|
|
T zero_special = T::zero();
|
|
zero_special.to_special();
|
|
|
|
for (size_t i = 0; i < vec.size(); ++i)
|
|
{
|
|
if (!vec[i].is_zero())
|
|
{
|
|
vec[i] = *it;
|
|
++it;
|
|
}
|
|
else
|
|
{
|
|
vec[i] = zero_special;
|
|
}
|
|
}
|
|
leave_block("Batch-convert elements to special form");
|
|
}
|
|
|
|
} // libsnark
|
|
|
|
#endif // MULTIEXP_TCC_
|