recon / eval /run_recon_linear.py
MukulRay's picture
Phase 1.3: smoke test scripts, eval runner, check_progress, fix unicode in run_eval.py
f95672f
import sys, os, json
sys.path.insert(0, '.')
from dotenv import load_dotenv
load_dotenv()
from eval.run_eval import run_architecture, run_recon_full
def _load_json(path):
    """Return the parsed contents of the JSON file at *path*.

    The file is opened in binary mode so ``json.load`` detects the encoding
    (UTF-8/16/32) itself instead of relying on the platform's locale default,
    and the ``with`` block closes the handle as soon as parsing finishes.
    """
    with open(path, 'rb') as fh:
        return json.load(fh)


# Evaluation inputs: the question set and the list of ground-truth entries.
questions = _load_json('eval/questions.json')
ground_truth_list = _load_json('eval/ground_truth.json')
# Index ground truth by entry id; this mapping is passed to the runner as gt_map.
gt = {e['id']: e for e in ground_truth_list}

# CSV that run_architecture is told to write to (and that the summary reads back).
OUTPUT = 'eval/results/recon_linear_v2.csv'

print(f"Running recon_linear v2 -- {len(questions)} questions")
print(f"Output: {OUTPUT}")
print("Crash-resume: enabled (will skip already-done question IDs)")
print("=" * 60)

# Run the full recon pipeline over every question with the 'linear' decay config.
run_architecture(
    arch_name='recon_linear_v2',
    decay_config='linear',
    runner_fn=run_recon_full,
    questions=questions,
    gt_map=gt,
    output_path=OUTPUT,
)
print("\nRun complete. Computing summary...")

import csv
from collections import Counter


def _pct(part, whole):
    """Return *part* / *whole* as a percentage; a zero *whole* is treated as 1."""
    return part / max(whole, 1) * 100


# newline='' is what the csv module asks for so that line endings embedded in
# quoted fields are parsed correctly.
# NOTE(review): the encoding is left at the platform default, as before;
# confirm what eval.run_eval writes before pinning one here.
with open(OUTPUT, newline='') as f:
    rows = list(csv.DictReader(f))

# Tally critic verdicts in a single pass; keys appear in first-seen order.
verdicts = [r.get('critic_verdict', '') for r in rows]
verdict_counts = dict(Counter(verdicts))

# Category slices, labelled "staleness" (B) and "contradiction" (C) in the report.
cat_b = [r for r in rows if r.get('category') == 'B']
cat_c = [r for r in rows if r.get('category') == 'C']

stale_rows = [r for r in rows if r.get('critic_verdict', '') == 'STALE']
contra_rows = [r for r in rows if r.get('critic_verdict', '') == 'CONTRADICTED']

# Only rows that carry a recognised position_accuracy label count toward accuracy.
acc_rows = [
    r for r in rows
    if r.get('position_accuracy') in ('MATCH', 'PARTIAL', 'MISMATCH')
]
match_rows = [r for r in acc_rows if r.get('position_accuracy') == 'MATCH']

# A Cat B row counts as caught when the verdict is STALE or CONTRADICTED.
staleness_caught = sum(
    1 for r in cat_b
    if r.get('critic_verdict', '') in ('STALE', 'CONTRADICTED')
)
# A Cat C row counts as caught only when the verdict is CONTRADICTED.
contradiction_caught = sum(
    1 for r in cat_c
    if r.get('critic_verdict', '') == 'CONTRADICTED'
)

# Compute each rate once and reuse it in every line of the report.
staleness_pct = _pct(staleness_caught, len(cat_b))
contradiction_pct = _pct(contradiction_caught, len(cat_c))
accuracy_pct = _pct(len(match_rows), len(acc_rows))

print(f"\nTotal rows: {len(rows)}")
print(f"Verdict distribution: {verdict_counts}")
print(f"\nSTALE: {len(stale_rows)} | CONTRADICTED: {len(contra_rows)}")
print(f"\nCat B (staleness, n={len(cat_b)}): staleness_caught={staleness_caught} ({staleness_pct:.1f}%)")
print(f"Cat C (contradiction, n={len(cat_c)}): contradiction_caught={contradiction_caught} ({contradiction_pct:.1f}%)")
print(f"\nPosition accuracy (MATCH): {len(match_rows)}/{len(acc_rows)} = {accuracy_pct:.1f}%")
print("\nv1 baseline comparison:")
print(f" Contradiction catch rate: v1=0.0% v2={contradiction_pct:.1f}%")
print(f" Position accuracy: v1=43.9% v2={accuracy_pct:.1f}%")