"""Stage-1 corpus integrity verification.

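Compares the first 10,000 decimals of π in pi_10k.txt against an INDEPENDENT
mpmath computation at higher precision (10,500 working digits, rendered to
10,100 significant digits, then truncated to 10,000 decimals).
The corpus passes only if it equals the reference digit for digit, a seeded
random sample of 50 positions also agrees, and the first 100 decimals match a
hard-coded canonical string.

Also produces a SHA256 checksum for reproducibility. Exits with status 0 when
every check passes and 1 otherwise.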
"""
from __future__ import annotations
import hashlib
import random
import sys
from pathlib import Path

from mpmath import mp

# Corpus location: data/pi_10k.txt in the directory that holds this script.
DATA = Path(__file__).parent / "data" / "pi_10k.txt"

# 1. Load corpus
# The file is decoded as ASCII (any non-ASCII byte raises UnicodeDecodeError);
# leading/trailing whitespace such as a final newline is not part of the corpus.
corpus = DATA.read_text(encoding="ascii").strip()
N = len(corpus)
# Raise explicitly instead of using `assert`: assert statements are removed
# under `python -O`, which would silently switch this integrity gate off.
if N != 10000:
    raise AssertionError(f"expected 10,000 digits in pi_10k.txt, got {N}")

# 2. Independent recomputation via mpmath at higher precision
mp.dps = 10500  # ask for extra precision so the first 10,000 are stable
# Render 10,100 significant digits, i.e. more decimals than the N compared
# below; strip_zeros=False keeps any trailing zeros in the rendered string.
ref = mp.nstr(mp.pi, 10100, strip_zeros=False)
# Explicit raises instead of `assert`, so the checks survive `python -O`.
if not ref.startswith("3."):
    raise AssertionError(f"unexpected reference format: {ref[:8]!r}")
# Skip the verified "3." prefix (2 characters) and keep exactly N decimals.
ref_digits = ref[2:2 + N]
if len(ref_digits) != N:
    raise AssertionError(
        f"reference yielded {len(ref_digits)} decimals, expected {N}"
    )

# 3. Full byte-equality check
# Whole-string comparison of the mpmath reference against the corpus.
full_match = ref_digits == corpus

# 4. Random-sample check (additional integrity affirmation)
# Seeded so the 50 sampled positions are repeatable between runs.
random.seed(42)
sample_positions = random.sample(range(1, N + 1), 50)  # 50 positions
sample_positions.sort()
# One (position, corpus digit, reference digit, equal?) tuple per sample.
# Positions are 1-based, hence the "- 1" when indexing the strings.
sample_results = [
    (pos, corpus[pos - 1], ref_digits[pos - 1], corpus[pos - 1] == ref_digits[pos - 1])
    for pos in sample_positions
]
sample_all_ok = all(entry[3] for entry in sample_results)

# 5. Spot check the first 100 decimals against a baked-in canonical
# Five adjacent literals of 20 digits each, concatenated into one
# 100-character string (decimals only, no leading "3.").
CANONICAL_FIRST_100 = (
    "14159265358979323846"
    "26433832795028841971"
    "69399375105820974944"
    "59230781640628620899"
    "86280348253421170679"
)
# True when the corpus begins with exactly these 100 digits.
first100_ok = corpus.startswith(CANONICAL_FIRST_100)

# 6. SHA256 for reproducibility
# Hash the ASCII-encoded corpus string and keep the lowercase hex digest.
_sha256 = hashlib.sha256()
_sha256.update(corpus.encode("ascii"))
sha = _sha256.hexdigest()

# ---- Report ----
def _status(passed: bool) -> str:
    """Render a check outcome as the label printed in the report."""
    return "PASS" if passed else "FAIL"


# Overall verdict: every individual check has to succeed.
result = first100_ok and full_match and sample_all_ok

banner = "=" * 64
print(banner)
print("Pi-Search · Stage-1 corpus integrity verification")
print(banner)
print(f"File:                  {DATA}")
print(f"Length:                {N:,} decimals")
print("Convention:            decimals only (no leading 3); position 1 = '1'")
print(f"First 20 chars:        {corpus[:20]}")
print()
print(f"SHA-256:               {sha}")
print()
print(f"Spot check (first 100): {_status(first100_ok)}")
print(f"Independent recomputation match (all 10,000 digits): {_status(full_match)}")
print(f"Random-sample positions checked: {len(sample_positions)}")
print(f"Sample results all match independent reference: {_status(sample_all_ok)}")
print()
print("Sample positions (showing first 12 of 50):")
# Only the first dozen sampled positions are listed to keep the report short.
for pos, corpus_digit, ref_digit, matched in sample_results[:12]:
    print(f"  position {pos:>5}:  corpus={corpus_digit}  ref={ref_digit}  match={matched}")
print()
if result:
    verdict = "ALL CHECKS PASS"
else:
    verdict = "FAILURE — investigate"
print(f"VERDICT: {verdict}")
# Exit status 0 on success, 1 on any failed check.
sys.exit(int(not result))
