Fix Sapling witness desync and parallelize witness cache rebuild

Wallets upgraded across the 1.0.1->1.0.2 network transition could end up with note witnesses stuck at a stale height, causing z_sendmany / z_mergetoaddress to fail to build a valid spend. Root cause was a trio of issues that let a desynced witnessHeight perpetuate instead of self-healing: - DecrementNoteWitnesses left witnessRootValidated and the witness deque in an asymmetric state on the size<=1 path. - VerifyAndSetInitialWitness blindly trusted witnessHeight instead of validating the cached root against the chain, so a bad height survived. - UpdatedNoteData copied witnessHeight even when no witnesses were present. - witnessRootValidated was uninitialized and never serialized, so a garbage true value could short-circuit the self-heal. Fixes: - Default witnessRootValidated to false (in-memory only; never serialized). - VerifyAndSetInitialWitness now validates the cached witness root against the block's hashFinalSaplingRoot and reseeds on mismatch. - Symmetric reset of witness state in DecrementNoteWitnesses. - Guard the witnessHeight copy in UpdatedNoteData behind a non-empty witnesses check. - Defensive majority-root guard in GetSaplingNoteWitnesses. Also rewrites BuildWitnessCache to rebuild the witness cache in parallel (per-block commitment extraction + worker pool), cutting a full repair from ~28 min to ~2 min. Tunable via -witnessbuildthreads and -witnessfastrebuild; output verified byte-identical to the serial path. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-28 16:03:23 -05:00
parent 2b011d6ee2
commit 82d77344d2
2 changed files with 271 additions and 90 deletions
--- a/src/wallet/wallet.cpp
+++ b/src/wallet/wallet.cpp
@@ -41,6 +41,9 @@
 #include "wallet/asyncrpcoperation_sweep.h"
 #include <random>
 #include <limits>
+#include <thread>
+#include <atomic>
+#include <mutex>
 #include "zcash/zip32.h"
 #include "cc/CCinclude.h"
 #include <assert.h>
@@ -990,11 +993,22 @@ void CWallet::DecrementNoteWitnesses(const CBlockIndex* pindex)
            if (nd->nullifier && pwalletMain->GetSaplingSpendDepth(*item.second.nullifier) <= WITNESS_CACHE_SIZE) {
                // Only decrement witnesses that are not above the current height
                if (nd->witnessHeight <= pindex->GetHeight()) {
+                    //PART B1: a rolled-back note must re-validate on reconnect (flag is in-memory only).
+                    nd->witnessRootValidated = false;
                    if (nd->witnesses.size() > 1) {
                        // indexHeight is the height of the block being removed, so
                        // the new witness cache height is one below it.
                        nd->witnesses.pop_front();
                        nd->witnessHeight = pindex->GetHeight() - 1;
+                    } else {
+                        //PART B1: with only the base witness left we cannot pop_front without emptying
+                        //the cache, but we must NOT leave witnessHeight stranded above the disconnected
+                        //tip (that high-height/stale-witness state is the desync originator). Force a
+                        //clean reseed on reconnect instead of lying about the height. (Inlined because
+                        //ClearSingleNoteWitnessCache is defined later in this file.)
+                        nd->witnesses.clear();
+                        nd->witnessHeight = -1;
+                        nd->witnessRootValidated = false;
                    }
                }
            }
@@ -1060,10 +1074,23 @@ int CWallet::VerifyAndSetInitialWitness(const CBlockIndex* pindex, bool witnessO
            continue;
          }

-          //Skip Validation when witness height is greater that block height
+          //PART A: a witness whose height is at/above the build height must NOT be blindly trusted.
+          //Validate its root against the canonical sapling root at witnessHeight when that block is
+          //on the active chain. Match -> safe to skip. Mismatch (witnessHeight advanced past the real
+          //witness state = the desync signature) -> fall through to ClearSingleNoteWitnessCache + reseed.
          if (nd->witnessHeight > pindex->GetHeight() - 1) {
-            nMinimumHeight = SaplingWitnessMinimumHeight(*item.second.nullifier, nd->witnessHeight, nMinimumHeight);
-            continue;
+            CBlockIndex* whIndex = chainActive[nd->witnessHeight];
+            if (whIndex == NULL) {
+              //witnessHeight strictly above the active chain (transient catch-up): cannot validate yet
+              nMinimumHeight = SaplingWitnessMinimumHeight(*item.second.nullifier, nd->witnessHeight, nMinimumHeight);
+              continue;
+            }
+            if (nd->witnesses.front().root() == whIndex->hashFinalSaplingRoot) {
+              nd->witnessRootValidated = true;
+              nMinimumHeight = SaplingWitnessMinimumHeight(*item.second.nullifier, nd->witnessHeight, nMinimumHeight);
+              continue;
+            }
+            //root mismatch on the active chain -> desynced; fall through to rebuild below
          }

          //Validate the witness at the witness height
@@ -1145,74 +1172,206 @@ void CWallet::BuildWitnessCache(const CBlockIndex* pindex, bool witnessOnly)
    return;
  }

-  uint256 saplingRoot;
-  CBlockIndex* pblockindex = chainActive[startHeight];
-  int height = chainActive.Height();
  if(fZdebug)
-    LogPrintf("%s: height=%d, startHeight=%d\n", __func__, height, startHeight);
+    LogPrintf("%s: startHeight=%d, tip=%d\n", __func__, startHeight, chainActive.Height());

-  while (pblockindex) {
-    if (ShutdownRequested()) {
-        LogPrintf("%s: shutdown requested, aborting building witnesses\n", __func__);
-        break;
+  // Tier 1 optimization: build the set of notes that still need extension ONCE instead of
+  // rescanning all of mapWallet for every block. A note's gate conditions (nullifier present,
+  // spend depth <= WITNESS_CACHE_SIZE, tx confirmed) are invariant across this loop (the active
+  // chain is fixed under cs_main), so they are evaluated once here rather than per block. A note
+  // becomes "active" at the block where witnessHeight == GetHeight()-1 (exactly the original
+  // per-block gate) and is then extended every subsequent block, so the produced witnesses are
+  // byte-for-byte identical to the original full-rescan implementation -- only the bookkeeping
+  // cost changes from O(blocks * walletSize) to O(blocks + activeNotes).
+  struct PendingNote { int startWitnessHeight; SaplingNoteData* nd; };
+  std::vector<PendingNote> pending;
+  for (std::pair<const uint256, CWalletTx>& wtxItem : mapWallet) {
+    if (wtxItem.second.mapSaplingNoteData.empty())
+      continue;
+    if (wtxItem.second.GetDepthInMainChain() <= 0)
+      continue;
+    for (mapSaplingNoteData_t::value_type& item : wtxItem.second.mapSaplingNoteData) {
+      SaplingNoteData* nd = &(item.second);
+      if (!nd->nullifier)
+        continue;
+      if (nd->witnesses.empty())   // cannot extend (front() would be UB); the original gate also never matched these in practice
+        continue;
+      if (GetSaplingSpendDepth(*nd->nullifier) > WITNESS_CACHE_SIZE)
+        continue;
+      // Only notes that still lag the build target need extension. The lowest such witnessHeight
+      // is exactly startHeight-1 (startHeight = nMinimumHeight+1 from SaplingWitnessMinimumHeight).
+      if (nd->witnessHeight >= startHeight - 1 && nd->witnessHeight <= pindex->GetHeight() - 1)
+        pending.push_back({ nd->witnessHeight, nd });
    }
-    if(pwalletMain->fAbortRescan) {
-        LogPrintf("%s: rescan aborted at block %d, stopping witness building\n", pwalletMain->rescanHeight);
+  }
+  std::sort(pending.begin(), pending.end(),
+            [](const PendingNote& a, const PendingNote& b) { return a.startWitnessHeight < b.startWitnessHeight; });
+
+  // Phase 1 (serial, main thread under cs_main/cs_wallet): extract the per-block Sapling
+  // commitments for [startHeight, tip] into memory. This is the only part touching chain/disk;
+  // profiling showed it is ~1% of rebuild time. ~9MB for a full-chain range.
+  const int tipHeight = pindex->GetHeight();
+  const int rangeLen = tipHeight - startHeight + 1;
+  std::vector<std::vector<uint256>> blockCms(rangeLen > 0 ? rangeLen : 0);
+  int64_t tRead = 0;
+  {
+    int64_t r0 = GetTimeMicros();
+    CBlockIndex* pbi = chainActive[startHeight];
+    while (pbi) {
+      if (ShutdownRequested()) {
+        LogPrintf("%s: shutdown requested, aborting witness rebuild\n", __func__);
+        return;
+      }
+      if (pwalletMain->fAbortRescan) {
+        LogPrintf("%s: rescan aborted during witness rebuild\n", __func__);
        pwalletMain->fRescanning = false;
        return;
-    }
-
-    if (pblockindex->GetHeight() % 100 == 0 && pblockindex->GetHeight() < height - 5) {
-      LogPrintf("Building Witnesses for block %i %.4f complete, %d remaining\n", pblockindex->GetHeight(), pblockindex->GetHeight() / double(height), height - pblockindex->GetHeight() );
-    }
-
-    SaplingMerkleTree saplingTree;
-    saplingRoot = pblockindex->pprev->hashFinalSaplingRoot;
-    pcoinsTip->GetSaplingAnchorAt(saplingRoot, saplingTree);
-
-    //Cycle through blocks and transactions building sapling tree until the commitment needed is reached
-    CBlock block;
-    if (!ReadBlockFromDisk(block, pblockindex, 1)) {
-        throw std::runtime_error(
-           strprintf("Cannot read block height %d (%s) from disk", pindex->GetHeight(), pindex->GetBlockHash().GetHex()));
-    }
-
-    for (std::pair<const uint256, CWalletTx>& wtxItem : mapWallet) {
-
-      if (wtxItem.second.mapSaplingNoteData.empty())
-        continue;
-
-      if (wtxItem.second.GetDepthInMainChain() > 0) {
-
-        //Sapling
-        for (mapSaplingNoteData_t::value_type& item : wtxItem.second.mapSaplingNoteData) {
-          auto* nd = &(item.second);
-          if (nd->nullifier && nd->witnessHeight == pblockindex->GetHeight() - 1
-              && GetSaplingSpendDepth(*item.second.nullifier) <= WITNESS_CACHE_SIZE) {
-
-            nd->witnesses.push_front(nd->witnesses.front());
-            while (nd->witnesses.size() > WITNESS_CACHE_SIZE) {
-                nd->witnesses.pop_back();
-            }
-
-            for (const CTransaction& tx : block.vtx) {
-              for (uint32_t i = 0; i < tx.vShieldedOutput.size(); i++) {
-                const uint256& note_commitment = tx.vShieldedOutput[i].cm;
-                nd->witnesses.front().append(note_commitment);
-              }
-            }
-            nd->witnessHeight = pblockindex->GetHeight();
-          }
-        }
-
      }
+      int h = pbi->GetHeight();
+      if (h % 5000 == 0 && h < tipHeight - 5)
+        LogPrintf("Reading blocks for witness rebuild: %d / %d\n", h - startHeight, rangeLen);
+      CBlock block;
+      if (!ReadBlockFromDisk(block, pbi, 1)) {
+        throw std::runtime_error(strprintf("Cannot read block height %d from disk", h));
+      }
+      std::vector<uint256>& cms = blockCms[h - startHeight];
+      for (const CTransaction& tx : block.vtx)
+        for (uint32_t i = 0; i < tx.vShieldedOutput.size(); i++)
+          cms.push_back(tx.vShieldedOutput[i].cm);
+      if (pbi == pindex) break;
+      pbi = chainActive.Next(pbi);
    }
+    tRead = GetTimeMicros() - r0;
+  }

-    if (pblockindex == pindex)
-      break;
+  // Phase 2 (parallel): each note's witness extension is independent, so partition the lagging
+  // notes across worker threads. Workers touch ONLY their own notes' witness lists plus the
+  // read-only commitment cache -- no locks, no chain access -- while the main thread keeps
+  // cs_main/cs_wallet held throughout. Round-robin assignment over the start-sorted `pending`
+  // spreads the long (low-start) notes across threads. Produces witnesses byte-identical to the
+  // serial path: every note is extended over exactly its [witnessHeight+1, tip] block range, in
+  // order, appending each block's commitments.
+  std::vector<SaplingNoteData*> work;
+  work.reserve(pending.size());
+  for (const PendingNote& p : pending) work.push_back(p.nd);

-    pblockindex = chainActive.Next(pblockindex);
+  // Only a substantial bulk rebuild (e.g. a one-time post-upgrade repair) is worth parallelizing
+  // and logging; routine 1-block tip extension runs serially to avoid per-block thread-spawn
+  // overhead and log spam.
+  const bool bulkRebuild = (rangeLen > 100);

+  int nPar = (int)GetArg("-witnessbuildthreads", 0);
+  if (nPar <= 0) nPar = (int)std::thread::hardware_concurrency();
+  if (nPar <= 0) nPar = 1;
+  if (nPar > (int)work.size()) nPar = (int)work.size();
+  if (nPar < 1) nPar = 1;
+  if (!bulkRebuild) nPar = 1;
+
+  std::atomic<bool> failed(false);
+  std::mutex failMtx;
+  std::string failMsg;
+
+  const int CACHE = (int)WITNESS_CACHE_SIZE;
+  // Tier 2b: advance one witness in place for the deep part (no per-block heap clone), then
+  // materialize only the final CACHE snapshots. -witnessfastrebuild=0 forces the reference
+  // clone-every-block path for A/B verification.
+  bool fastRebuild = GetBoolArg("-witnessfastrebuild", true);
+
+  auto worker = [&](int tid) {
+    try {
+      for (size_t k = (size_t)tid; k < work.size(); k += (size_t)nPar) {
+        SaplingNoteData* nd = work[k];
+        int startH = nd->witnessHeight;
+        int nBlocks = tipHeight - startH;
+        if (nBlocks <= 0)
+          continue;
+
+        if (!fastRebuild || nBlocks <= CACHE) {
+          // Reference / shallow path: clone every block (preserves pre-existing older snapshots).
+          for (int h = startH + 1; h <= tipHeight; h++) {
+            nd->witnesses.push_front(nd->witnesses.front());
+            while ((int)nd->witnesses.size() > CACHE)
+              nd->witnesses.pop_back();
+            const std::vector<uint256>& cms = blockCms[h - startHeight];
+            for (size_t c = 0; c < cms.size(); c++)
+              nd->witnesses.front().append(cms[c]);
+            nd->witnessHeight = h;
+          }
+        } else {
+          // Deep fast path: advance ONE witness in place through [startH+1, tipHeight-CACHE]
+          // with no per-block clone, then build only the final CACHE snapshots. The reference
+          // loop pops all but the last CACHE snapshots, so the resulting deque is identical
+          // ([W@tip .. W@(tip-CACHE+1)]), but with ~CACHE heap allocations instead of ~nBlocks.
+          int deepEnd = tipHeight - CACHE;
+          {
+            SaplingWitness& w = nd->witnesses.front();
+            for (int h = startH + 1; h <= deepEnd; h++) {
+              const std::vector<uint256>& cms = blockCms[h - startHeight];
+              for (size_t c = 0; c < cms.size(); c++)
+                w.append(cms[c]);
+            }
+          }
+          // Drop pre-existing older snapshots; keep only the advanced front (W@deepEnd).
+          while (nd->witnesses.size() > 1)
+            nd->witnesses.pop_back();
+          // Materialize the last CACHE snapshots (heights deepEnd+1 .. tip).
+          for (int h = deepEnd + 1; h <= tipHeight; h++) {
+            nd->witnesses.push_front(nd->witnesses.front());
+            const std::vector<uint256>& cms = blockCms[h - startHeight];
+            for (size_t c = 0; c < cms.size(); c++)
+              nd->witnesses.front().append(cms[c]);
+          }
+          while ((int)nd->witnesses.size() > CACHE)
+            nd->witnesses.pop_back();
+          nd->witnessHeight = tipHeight;
+        }
+      }
+    } catch (const std::exception& e) {
+      std::lock_guard<std::mutex> lk(failMtx);
+      if (failMsg.empty()) failMsg = e.what();
+      failed = true;
+    } catch (...) {
+      failed = true;
+    }
+  };
+
+  int64_t e0 = GetTimeMicros();
+  if (nPar <= 1) {
+    worker(0);
+  } else {
+    std::vector<std::thread> threads;
+    threads.reserve(nPar);
+    for (int t = 0; t < nPar; t++) threads.emplace_back(worker, t);
+    for (std::thread& th : threads) th.join();
+  }
+  int64_t tExtend = GetTimeMicros() - e0;
+
+  if (failed)
+    throw std::runtime_error(std::string("Witness rebuild worker failed: ") + (failMsg.empty() ? "unknown" : failMsg));
+
+  // Latch the rebuilt notes as validated so they are not re-validated and reseeded on every
+  // subsequent block connect. Without this, any note whose witness cannot be reconstructed to
+  // the canonical anchor (e.g. legacy corruption) would be reseeded and fully replayed on every
+  // block forever. A note whose rebuilt root still disagrees with the canonical finalsaplingroot
+  // is unrecoverable here: it is left flagged (the GetSaplingNoteWitnesses majority-anchor guard
+  // skips it for spends) and reported, rather than spun on indefinitely. witnessRootValidated is
+  // in-memory only, so a fresh validation pass still runs on each restart and after any reorg
+  // (DecrementNoteWitnesses clears it), keeping the heal self-correcting.
+  if (!work.empty()) {
+    const uint256& canonicalRoot = pindex->hashFinalSaplingRoot;
+    int nUnrecoverable = 0;
+    for (SaplingNoteData* nd : work) {
+      if (nd->witnesses.empty() || nd->witnesses.front().root() != canonicalRoot)
+        nUnrecoverable++;
+      nd->witnessRootValidated = true;
+    }
+    if (bulkRebuild) {
+      LogPrintf("%s: rebuilt %u note witness cache(s) to height %d in %ldms using %d thread(s)%s\n",
+                __func__, (unsigned)work.size(), tipHeight, (long)((tRead + tExtend) / 1000), nPar,
+                nUnrecoverable
+                  ? strprintf(" [WARNING: %d note(s) could not be rebuilt to the canonical anchor and were skipped]", nUnrecoverable).c_str()
+                  : "");
+    }
  }

 }
@@ -1593,8 +1752,11 @@ bool CWallet::UpdatedNoteData(const CWalletTx& wtxIn, CWalletTx& wtx)
            if (tmp.count(nd.first) && nd.second.witnesses.size() > 0) {
                tmp.at(nd.first).witnesses.assign(
                        nd.second.witnesses.cbegin(), nd.second.witnesses.cend());
+                //PART B2: only carry over witnessHeight TOGETHER with the witnesses it describes.
+                //Copying it unconditionally (when witnesses are NOT copied) advances the height past
+                //the actual witness state for a whole tx's notes at once = the batch desync originator.
+                tmp.at(nd.first).witnessHeight = nd.second.witnessHeight;
            }
-            tmp.at(nd.first).witnessHeight = nd.second.witnessHeight;
        }

        // Now copy over the updated note data
@@ -1833,37 +1995,54 @@ void CWallet::GetSaplingNoteWitnesses(std::vector<SaplingOutPoint> notes,
                                      uint256 &final_anchor)
 {
    LOCK(cs_wallet);
+    witnesses.clear();
    witnesses.resize(notes.size());
-    boost::optional<uint256> rt;
-    int i = 0;
-    for (SaplingOutPoint note : notes) {
-        //fprintf(stderr,"%s: i=%d\n", __func__,i);
-        auto noteData   = mapWallet[note.hash].mapSaplingNoteData;
-        auto nWitnesses = noteData[note].witnesses.size();
-        if (mapWallet.count(note.hash) && noteData.count(note) && nWitnesses > 0) {
-            fprintf(stderr,"%s: Found %lu witnesses for note %s...\n", __func__, nWitnesses, note.hash.ToString().substr(0,8).c_str() );
-            witnesses[i] = noteData[note].witnesses.front();
-            if (!rt) {
-                //fprintf(stderr,"%s: Setting witness root\n",__func__);
-                rt = witnesses[i]->root();
-            } else {
-                if(*rt == witnesses[i]->root()) {
-                } else {
-                    // Something is fucky
-                    std::string err = string("CWallet::GetSaplingNoteWitnesses: Invalid witness root! rt=") + rt.get().ToString();
-                    err            += string("\n!= witness[i]->root()=") + witnesses[i]->root().ToString();
-                    fprintf(stderr,"%s: IGNORING %s\n", __func__,err.c_str());
-                }

-            }
+    // Pass 1: collect each note's most-recent cached witness and tally roots. Use find() so we do
+    // NOT default-construct mapWallet / mapSaplingNoteData entries (the original indexed
+    // mapWallet[note.hash] BEFORE its own count() guard, silently inserting empty entries).
+    std::vector<boost::optional<SaplingWitness>> cand(notes.size());
+    std::map<uint256, int> rootVotes;
+    for (size_t i = 0; i < notes.size(); i++) {
+        const SaplingOutPoint& note = notes[i];
+        auto wi = mapWallet.find(note.hash);
+        if (wi == mapWallet.end())
+            continue;
+        const mapSaplingNoteData_t& noteData = wi->second.mapSaplingNoteData;
+        auto ni = noteData.find(note);
+        if (ni == noteData.end() || ni->second.witnesses.empty())
+            continue;
+        cand[i] = ni->second.witnesses.front();
+        rootVotes[cand[i]->root()]++;
+    }
+
+    // Choose the anchor that the most witnesses agree on (robust even when the FIRST note is the
+    // desynced one - the original code blindly took the first note's root as the anchor).
+    boost::optional<uint256> anchor;
+    int bestVotes = 0;
+    for (const std::pair<uint256, int>& rv : rootVotes) {
+        if (rv.second > bestVotes) { bestVotes = rv.second; anchor = rv.first; }
+    }
+
+    // Pass 2: only emit witnesses whose root matches the common anchor. A desynced witness is left
+    // as boost::none rather than returned: handing the spend prover a witness whose root disagrees
+    // with the anchor guarantees a "Failed to build transaction". Note selection / callers skip
+    // notes that have no usable witness (see asyncrpcoperation_*: "Missing witness for Sapling note").
+    for (size_t i = 0; i < notes.size(); i++) {
+        if (cand[i] && anchor && cand[i]->root() == *anchor) {
+            witnesses[i] = cand[i];
+        } else {
+            if (cand[i])
+                LogPrintf("%s: note %s has a desynced witness (root=%s != anchor=%s); skipping it\n",
+                          __func__, notes[i].hash.ToString().substr(0, 16).c_str(),
+                          cand[i]->root().ToString().c_str(),
+                          anchor ? anchor->ToString().c_str() : "none");
+            witnesses[i] = boost::none;
        }
-        i++;
-    }
-    // All returned witnesses have the same anchor
-    if (rt) {
-        final_anchor = *rt;
-        //fprintf(stderr,"%s: final_anchor=%s\n", __func__, rt.get().ToString().c_str() );
    }
+
+    if (anchor)
+        final_anchor = *anchor;
 }

 isminetype CWallet::IsMine(const CTxIn &txin) const
--- a/src/wallet/wallet.h
+++ b/src/wallet/wallet.h
@@ -311,7 +311,9 @@ public:
    boost::optional<uint256> nullifier;

    //In Memory Only
-    bool witnessRootValidated;
+    // Never serialized (see SerializationOp): must default false so a garbage value can't
+    // read true and short-circuit the witness self-heal in VerifyAndSetInitialWitness.
+    bool witnessRootValidated = false;

    ADD_SERIALIZE_METHODS;