Fix Sapling witness desync and parallelize witness cache rebuild

Wallets upgraded across the 1.0.1->1.0.2 network transition could end up
with note witnesses stuck at a stale height, causing z_sendmany /
z_mergetoaddress to fail to build a valid spend. Root cause was a trio of
issues that let a desynced witnessHeight perpetuate instead of self-healing:

- DecrementNoteWitnesses left witnessRootValidated and the witness deque in
  an asymmetric state on the size<=1 path.
- VerifyAndSetInitialWitness blindly trusted witnessHeight instead of
  validating the cached root against the chain, so a bad height survived.
- UpdatedNoteData copied witnessHeight even when no witnesses were present.
- witnessRootValidated was uninitialized and never serialized, so a garbage
  true value could short-circuit the self-heal.

Fixes:
- Default witnessRootValidated to false (in-memory only; never serialized).
- VerifyAndSetInitialWitness now validates the cached witness root against
  the block's hashFinalSaplingRoot and reseeds on mismatch.
- Symmetric reset of witness state in DecrementNoteWitnesses.
- Guard the witnessHeight copy in UpdatedNoteData behind a non-empty
  witnesses check.
- Defensive majority-root guard in GetSaplingNoteWitnesses.

Also rewrites BuildWitnessCache to rebuild the witness cache in parallel
(per-block commitment extraction + worker pool), cutting a full repair from
~28 min to ~2 min. Tunable via -witnessbuildthreads and -witnessfastrebuild;
output verified byte-identical to the serial path.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-28 16:03:23 -05:00
parent 2b011d6ee2
commit 82d77344d2
2 changed files with 271 additions and 90 deletions

View File

@@ -41,6 +41,9 @@
#include "wallet/asyncrpcoperation_sweep.h"
#include <random>
#include <limits>
#include <thread>
#include <atomic>
#include <mutex>
#include "zcash/zip32.h"
#include "cc/CCinclude.h"
#include <assert.h>
@@ -990,11 +993,22 @@ void CWallet::DecrementNoteWitnesses(const CBlockIndex* pindex)
if (nd->nullifier && pwalletMain->GetSaplingSpendDepth(*item.second.nullifier) <= WITNESS_CACHE_SIZE) {
// Only decrement witnesses that are not above the current height
if (nd->witnessHeight <= pindex->GetHeight()) {
//PART B1: a rolled-back note must re-validate on reconnect (flag is in-memory only).
nd->witnessRootValidated = false;
if (nd->witnesses.size() > 1) {
// indexHeight is the height of the block being removed, so
// the new witness cache height is one below it.
nd->witnesses.pop_front();
nd->witnessHeight = pindex->GetHeight() - 1;
} else {
//PART B1: with only the base witness left we cannot pop_front without emptying
//the cache, but we must NOT leave witnessHeight stranded above the disconnected
//tip (that high-height/stale-witness state is the desync originator). Force a
//clean reseed on reconnect instead of lying about the height. (Inlined because
//ClearSingleNoteWitnessCache is defined later in this file.)
nd->witnesses.clear();
nd->witnessHeight = -1;
nd->witnessRootValidated = false;
}
}
}
@@ -1060,10 +1074,23 @@ int CWallet::VerifyAndSetInitialWitness(const CBlockIndex* pindex, bool witnessO
continue;
}
//Skip Validation when witness height is greater that block height
//PART A: a witness whose height is at/above the build height must NOT be blindly trusted.
//Validate its root against the canonical sapling root at witnessHeight when that block is
//on the active chain. Match -> safe to skip. Mismatch (witnessHeight advanced past the real
//witness state = the desync signature) -> fall through to ClearSingleNoteWitnessCache + reseed.
if (nd->witnessHeight > pindex->GetHeight() - 1) {
nMinimumHeight = SaplingWitnessMinimumHeight(*item.second.nullifier, nd->witnessHeight, nMinimumHeight);
continue;
CBlockIndex* whIndex = chainActive[nd->witnessHeight];
if (whIndex == NULL) {
//witnessHeight strictly above the active chain (transient catch-up): cannot validate yet
nMinimumHeight = SaplingWitnessMinimumHeight(*item.second.nullifier, nd->witnessHeight, nMinimumHeight);
continue;
}
if (nd->witnesses.front().root() == whIndex->hashFinalSaplingRoot) {
nd->witnessRootValidated = true;
nMinimumHeight = SaplingWitnessMinimumHeight(*item.second.nullifier, nd->witnessHeight, nMinimumHeight);
continue;
}
//root mismatch on the active chain -> desynced; fall through to rebuild below
}
//Validate the witness at the witness height
@@ -1145,74 +1172,206 @@ void CWallet::BuildWitnessCache(const CBlockIndex* pindex, bool witnessOnly)
return;
}
uint256 saplingRoot;
CBlockIndex* pblockindex = chainActive[startHeight];
int height = chainActive.Height();
if(fZdebug)
LogPrintf("%s: height=%d, startHeight=%d\n", __func__, height, startHeight);
LogPrintf("%s: startHeight=%d, tip=%d\n", __func__, startHeight, chainActive.Height());
while (pblockindex) {
if (ShutdownRequested()) {
LogPrintf("%s: shutdown requested, aborting building witnesses\n", __func__);
break;
// Tier 1 optimization: build the set of notes that still need extension ONCE instead of
// rescanning all of mapWallet for every block. A note's gate conditions (nullifier present,
// spend depth <= WITNESS_CACHE_SIZE, tx confirmed) are invariant across this loop (the active
// chain is fixed under cs_main), so they are evaluated once here rather than per block. A note
// becomes "active" at the block where witnessHeight == GetHeight()-1 (exactly the original
// per-block gate) and is then extended every subsequent block, so the produced witnesses are
// byte-for-byte identical to the original full-rescan implementation -- only the bookkeeping
// cost changes from O(blocks * walletSize) to O(blocks + activeNotes).
struct PendingNote { int startWitnessHeight; SaplingNoteData* nd; };
std::vector<PendingNote> pending;
for (std::pair<const uint256, CWalletTx>& wtxItem : mapWallet) {
if (wtxItem.second.mapSaplingNoteData.empty())
continue;
if (wtxItem.second.GetDepthInMainChain() <= 0)
continue;
for (mapSaplingNoteData_t::value_type& item : wtxItem.second.mapSaplingNoteData) {
SaplingNoteData* nd = &(item.second);
if (!nd->nullifier)
continue;
if (nd->witnesses.empty()) // cannot extend (front() would be UB); the original gate also never matched these in practice
continue;
if (GetSaplingSpendDepth(*nd->nullifier) > WITNESS_CACHE_SIZE)
continue;
// Only notes that still lag the build target need extension. The lowest such witnessHeight
// is exactly startHeight-1 (startHeight = nMinimumHeight+1 from SaplingWitnessMinimumHeight).
if (nd->witnessHeight >= startHeight - 1 && nd->witnessHeight <= pindex->GetHeight() - 1)
pending.push_back({ nd->witnessHeight, nd });
}
if(pwalletMain->fAbortRescan) {
LogPrintf("%s: rescan aborted at block %d, stopping witness building\n", pwalletMain->rescanHeight);
}
std::sort(pending.begin(), pending.end(),
[](const PendingNote& a, const PendingNote& b) { return a.startWitnessHeight < b.startWitnessHeight; });
// Phase 1 (serial, main thread under cs_main/cs_wallet): extract the per-block Sapling
// commitments for [startHeight, tip] into memory. This is the only part touching chain/disk;
// profiling showed it is ~1% of rebuild time. ~9MB for a full-chain range.
const int tipHeight = pindex->GetHeight();
const int rangeLen = tipHeight - startHeight + 1;
std::vector<std::vector<uint256>> blockCms(rangeLen > 0 ? rangeLen : 0);
int64_t tRead = 0;
{
int64_t r0 = GetTimeMicros();
CBlockIndex* pbi = chainActive[startHeight];
while (pbi) {
if (ShutdownRequested()) {
LogPrintf("%s: shutdown requested, aborting witness rebuild\n", __func__);
return;
}
if (pwalletMain->fAbortRescan) {
LogPrintf("%s: rescan aborted during witness rebuild\n", __func__);
pwalletMain->fRescanning = false;
return;
}
if (pblockindex->GetHeight() % 100 == 0 && pblockindex->GetHeight() < height - 5) {
LogPrintf("Building Witnesses for block %i %.4f complete, %d remaining\n", pblockindex->GetHeight(), pblockindex->GetHeight() / double(height), height - pblockindex->GetHeight() );
}
SaplingMerkleTree saplingTree;
saplingRoot = pblockindex->pprev->hashFinalSaplingRoot;
pcoinsTip->GetSaplingAnchorAt(saplingRoot, saplingTree);
//Cycle through blocks and transactions building sapling tree until the commitment needed is reached
CBlock block;
if (!ReadBlockFromDisk(block, pblockindex, 1)) {
throw std::runtime_error(
strprintf("Cannot read block height %d (%s) from disk", pindex->GetHeight(), pindex->GetBlockHash().GetHex()));
}
for (std::pair<const uint256, CWalletTx>& wtxItem : mapWallet) {
if (wtxItem.second.mapSaplingNoteData.empty())
continue;
if (wtxItem.second.GetDepthInMainChain() > 0) {
//Sapling
for (mapSaplingNoteData_t::value_type& item : wtxItem.second.mapSaplingNoteData) {
auto* nd = &(item.second);
if (nd->nullifier && nd->witnessHeight == pblockindex->GetHeight() - 1
&& GetSaplingSpendDepth(*item.second.nullifier) <= WITNESS_CACHE_SIZE) {
nd->witnesses.push_front(nd->witnesses.front());
while (nd->witnesses.size() > WITNESS_CACHE_SIZE) {
nd->witnesses.pop_back();
}
for (const CTransaction& tx : block.vtx) {
for (uint32_t i = 0; i < tx.vShieldedOutput.size(); i++) {
const uint256& note_commitment = tx.vShieldedOutput[i].cm;
nd->witnesses.front().append(note_commitment);
}
}
nd->witnessHeight = pblockindex->GetHeight();
}
}
}
int h = pbi->GetHeight();
if (h % 5000 == 0 && h < tipHeight - 5)
LogPrintf("Reading blocks for witness rebuild: %d / %d\n", h - startHeight, rangeLen);
CBlock block;
if (!ReadBlockFromDisk(block, pbi, 1)) {
throw std::runtime_error(strprintf("Cannot read block height %d from disk", h));
}
std::vector<uint256>& cms = blockCms[h - startHeight];
for (const CTransaction& tx : block.vtx)
for (uint32_t i = 0; i < tx.vShieldedOutput.size(); i++)
cms.push_back(tx.vShieldedOutput[i].cm);
if (pbi == pindex) break;
pbi = chainActive.Next(pbi);
}
tRead = GetTimeMicros() - r0;
}
if (pblockindex == pindex)
break;
// Phase 2 (parallel): each note's witness extension is independent, so partition the lagging
// notes across worker threads. Workers touch ONLY their own notes' witness lists plus the
// read-only commitment cache -- no locks, no chain access -- while the main thread keeps
// cs_main/cs_wallet held throughout. Round-robin assignment over the start-sorted `pending`
// spreads the long (low-start) notes across threads. Produces witnesses byte-identical to the
// serial path: every note is extended over exactly its [witnessHeight+1, tip] block range, in
// order, appending each block's commitments.
std::vector<SaplingNoteData*> work;
work.reserve(pending.size());
for (const PendingNote& p : pending) work.push_back(p.nd);
pblockindex = chainActive.Next(pblockindex);
// Only a substantial bulk rebuild (e.g. a one-time post-upgrade repair) is worth parallelizing
// and logging; routine 1-block tip extension runs serially to avoid per-block thread-spawn
// overhead and log spam.
const bool bulkRebuild = (rangeLen > 100);
int nPar = (int)GetArg("-witnessbuildthreads", 0);
if (nPar <= 0) nPar = (int)std::thread::hardware_concurrency();
if (nPar <= 0) nPar = 1;
if (nPar > (int)work.size()) nPar = (int)work.size();
if (nPar < 1) nPar = 1;
if (!bulkRebuild) nPar = 1;
std::atomic<bool> failed(false);
std::mutex failMtx;
std::string failMsg;
const int CACHE = (int)WITNESS_CACHE_SIZE;
// Tier 2b: advance one witness in place for the deep part (no per-block heap clone), then
// materialize only the final CACHE snapshots. -witnessfastrebuild=0 forces the reference
// clone-every-block path for A/B verification.
bool fastRebuild = GetBoolArg("-witnessfastrebuild", true);
auto worker = [&](int tid) {
try {
for (size_t k = (size_t)tid; k < work.size(); k += (size_t)nPar) {
SaplingNoteData* nd = work[k];
int startH = nd->witnessHeight;
int nBlocks = tipHeight - startH;
if (nBlocks <= 0)
continue;
if (!fastRebuild || nBlocks <= CACHE) {
// Reference / shallow path: clone every block (preserves pre-existing older snapshots).
for (int h = startH + 1; h <= tipHeight; h++) {
nd->witnesses.push_front(nd->witnesses.front());
while ((int)nd->witnesses.size() > CACHE)
nd->witnesses.pop_back();
const std::vector<uint256>& cms = blockCms[h - startHeight];
for (size_t c = 0; c < cms.size(); c++)
nd->witnesses.front().append(cms[c]);
nd->witnessHeight = h;
}
} else {
// Deep fast path: advance ONE witness in place through [startH+1, tipHeight-CACHE]
// with no per-block clone, then build only the final CACHE snapshots. The reference
// loop pops all but the last CACHE snapshots, so the resulting deque is identical
// ([W@tip .. W@(tip-CACHE+1)]), but with ~CACHE heap allocations instead of ~nBlocks.
int deepEnd = tipHeight - CACHE;
{
SaplingWitness& w = nd->witnesses.front();
for (int h = startH + 1; h <= deepEnd; h++) {
const std::vector<uint256>& cms = blockCms[h - startHeight];
for (size_t c = 0; c < cms.size(); c++)
w.append(cms[c]);
}
}
// Drop pre-existing older snapshots; keep only the advanced front (W@deepEnd).
while (nd->witnesses.size() > 1)
nd->witnesses.pop_back();
// Materialize the last CACHE snapshots (heights deepEnd+1 .. tip).
for (int h = deepEnd + 1; h <= tipHeight; h++) {
nd->witnesses.push_front(nd->witnesses.front());
const std::vector<uint256>& cms = blockCms[h - startHeight];
for (size_t c = 0; c < cms.size(); c++)
nd->witnesses.front().append(cms[c]);
}
while ((int)nd->witnesses.size() > CACHE)
nd->witnesses.pop_back();
nd->witnessHeight = tipHeight;
}
}
} catch (const std::exception& e) {
std::lock_guard<std::mutex> lk(failMtx);
if (failMsg.empty()) failMsg = e.what();
failed = true;
} catch (...) {
failed = true;
}
};
int64_t e0 = GetTimeMicros();
if (nPar <= 1) {
worker(0);
} else {
std::vector<std::thread> threads;
threads.reserve(nPar);
for (int t = 0; t < nPar; t++) threads.emplace_back(worker, t);
for (std::thread& th : threads) th.join();
}
int64_t tExtend = GetTimeMicros() - e0;
if (failed)
throw std::runtime_error(std::string("Witness rebuild worker failed: ") + (failMsg.empty() ? "unknown" : failMsg));
// Latch the rebuilt notes as validated so they are not re-validated and reseeded on every
// subsequent block connect. Without this, any note whose witness cannot be reconstructed to
// the canonical anchor (e.g. legacy corruption) would be reseeded and fully replayed on every
// block forever. A note whose rebuilt root still disagrees with the canonical finalsaplingroot
// is unrecoverable here: it is left flagged (the GetSaplingNoteWitnesses majority-anchor guard
// skips it for spends) and reported, rather than spun on indefinitely. witnessRootValidated is
// in-memory only, so a fresh validation pass still runs on each restart and after any reorg
// (DecrementNoteWitnesses clears it), keeping the heal self-correcting.
if (!work.empty()) {
const uint256& canonicalRoot = pindex->hashFinalSaplingRoot;
int nUnrecoverable = 0;
for (SaplingNoteData* nd : work) {
if (nd->witnesses.empty() || nd->witnesses.front().root() != canonicalRoot)
nUnrecoverable++;
nd->witnessRootValidated = true;
}
if (bulkRebuild) {
LogPrintf("%s: rebuilt %u note witness cache(s) to height %d in %ldms using %d thread(s)%s\n",
__func__, (unsigned)work.size(), tipHeight, (long)((tRead + tExtend) / 1000), nPar,
nUnrecoverable
? strprintf(" [WARNING: %d note(s) could not be rebuilt to the canonical anchor and were skipped]", nUnrecoverable).c_str()
: "");
}
}
}
@@ -1593,8 +1752,11 @@ bool CWallet::UpdatedNoteData(const CWalletTx& wtxIn, CWalletTx& wtx)
if (tmp.count(nd.first) && nd.second.witnesses.size() > 0) {
tmp.at(nd.first).witnesses.assign(
nd.second.witnesses.cbegin(), nd.second.witnesses.cend());
//PART B2: only carry over witnessHeight TOGETHER with the witnesses it describes.
//Copying it unconditionally (when witnesses are NOT copied) advances the height past
//the actual witness state for a whole tx's notes at once = the batch desync originator.
tmp.at(nd.first).witnessHeight = nd.second.witnessHeight;
}
tmp.at(nd.first).witnessHeight = nd.second.witnessHeight;
}
// Now copy over the updated note data
@@ -1833,37 +1995,54 @@ void CWallet::GetSaplingNoteWitnesses(std::vector<SaplingOutPoint> notes,
uint256 &final_anchor)
{
LOCK(cs_wallet);
witnesses.clear();
witnesses.resize(notes.size());
boost::optional<uint256> rt;
int i = 0;
for (SaplingOutPoint note : notes) {
//fprintf(stderr,"%s: i=%d\n", __func__,i);
auto noteData = mapWallet[note.hash].mapSaplingNoteData;
auto nWitnesses = noteData[note].witnesses.size();
if (mapWallet.count(note.hash) && noteData.count(note) && nWitnesses > 0) {
fprintf(stderr,"%s: Found %lu witnesses for note %s...\n", __func__, nWitnesses, note.hash.ToString().substr(0,8).c_str() );
witnesses[i] = noteData[note].witnesses.front();
if (!rt) {
//fprintf(stderr,"%s: Setting witness root\n",__func__);
rt = witnesses[i]->root();
} else {
if(*rt == witnesses[i]->root()) {
} else {
// Something is fucky
std::string err = string("CWallet::GetSaplingNoteWitnesses: Invalid witness root! rt=") + rt.get().ToString();
err += string("\n!= witness[i]->root()=") + witnesses[i]->root().ToString();
fprintf(stderr,"%s: IGNORING %s\n", __func__,err.c_str());
}
}
// Pass 1: collect each note's most-recent cached witness and tally roots. Use find() so we do
// NOT default-construct mapWallet / mapSaplingNoteData entries (the original indexed
// mapWallet[note.hash] BEFORE its own count() guard, silently inserting empty entries).
std::vector<boost::optional<SaplingWitness>> cand(notes.size());
std::map<uint256, int> rootVotes;
for (size_t i = 0; i < notes.size(); i++) {
const SaplingOutPoint& note = notes[i];
auto wi = mapWallet.find(note.hash);
if (wi == mapWallet.end())
continue;
const mapSaplingNoteData_t& noteData = wi->second.mapSaplingNoteData;
auto ni = noteData.find(note);
if (ni == noteData.end() || ni->second.witnesses.empty())
continue;
cand[i] = ni->second.witnesses.front();
rootVotes[cand[i]->root()]++;
}
// Choose the anchor that the most witnesses agree on (robust even when the FIRST note is the
// desynced one - the original code blindly took the first note's root as the anchor).
boost::optional<uint256> anchor;
int bestVotes = 0;
for (const std::pair<uint256, int>& rv : rootVotes) {
if (rv.second > bestVotes) { bestVotes = rv.second; anchor = rv.first; }
}
// Pass 2: only emit witnesses whose root matches the common anchor. A desynced witness is left
// as boost::none rather than returned: handing the spend prover a witness whose root disagrees
// with the anchor guarantees a "Failed to build transaction". Note selection / callers skip
// notes that have no usable witness (see asyncrpcoperation_*: "Missing witness for Sapling note").
for (size_t i = 0; i < notes.size(); i++) {
if (cand[i] && anchor && cand[i]->root() == *anchor) {
witnesses[i] = cand[i];
} else {
if (cand[i])
LogPrintf("%s: note %s has a desynced witness (root=%s != anchor=%s); skipping it\n",
__func__, notes[i].hash.ToString().substr(0, 16).c_str(),
cand[i]->root().ToString().c_str(),
anchor ? anchor->ToString().c_str() : "none");
witnesses[i] = boost::none;
}
i++;
}
// All returned witnesses have the same anchor
if (rt) {
final_anchor = *rt;
//fprintf(stderr,"%s: final_anchor=%s\n", __func__, rt.get().ToString().c_str() );
}
if (anchor)
final_anchor = *anchor;
}
isminetype CWallet::IsMine(const CTxIn &txin) const

View File

@@ -311,7 +311,9 @@ public:
boost::optional<uint256> nullifier;
//In Memory Only
bool witnessRootValidated;
// Never serialized (see SerializationOp): must default false so a garbage value can't
// read true and short-circuit the witness self-heal in VerifyAndSetInitialWitness.
bool witnessRootValidated = false;
ADD_SERIALIZE_METHODS;