import csv
import json
import os
import sys
from collections import Counter

# Put the current working directory first on the import path before the
# project import below (presumably the script is launched from the repo root).
sys.path.insert(0, '.')

from dotenv import load_dotenv

# Runs before the project import, as in the original ordering.
load_dotenv()

from eval.run_eval import run_architecture, run_recon_full

def _load_json(path):
    """Parse and return the JSON document stored at ``path``.

    The file is opened in a ``with`` block so the handle is closed as soon as
    parsing finishes, and it is decoded as UTF-8 rather than the platform
    default encoding.
    """
    with open(path, encoding='utf-8') as fh:
        return json.load(fh)


questions = _load_json('eval/questions.json')
ground_truth_list = _load_json('eval/ground_truth.json')
# Index the ground-truth entries by their 'id' field; if two entries share an
# id, the later one replaces the earlier one.
gt = {entry['id']: entry for entry in ground_truth_list}

# CSV path handed to run_architecture; the summary section re-reads this file.
OUTPUT = 'eval/results/recon_linear_v2.csv'

# Start-of-run banner, written with a single print call (the text on stdout is
# the same as printing each line separately).
_banner_lines = (
    f"Running recon_linear v2 -- {len(questions)} questions",
    f"Output: {OUTPUT}",
    "Crash-resume: enabled (will skip already-done question IDs)",
    "=" * 60,
)
print("\n".join(_banner_lines))

# Settings for this run, gathered in one mapping and passed as keywords.
_run_kwargs = {
    'arch_name': 'recon_linear_v2',
    'decay_config': 'linear',
    'runner_fn': run_recon_full,
    'questions': questions,
    'gt_map': gt,
    'output_path': OUTPUT,
}
run_architecture(**_run_kwargs)

| print("\nRun complete. Computing summary...") |
| import csv |
| with open(OUTPUT) as f: |
| rows = list(csv.DictReader(f)) |
|
|
| verdicts = [r.get('critic_verdict', '') for r in rows] |
| verdict_counts = {v: verdicts.count(v) for v in set(verdicts)} |
|
|
| cat_b = [r for r in rows if r.get('category') == 'B'] |
| cat_c = [r for r in rows if r.get('category') == 'C'] |
|
|
| stale_rows = [r for r in rows if r.get('critic_verdict', '') == 'STALE'] |
| contra_rows = [r for r in rows if r.get('critic_verdict', '') == 'CONTRADICTED'] |
|
|
| acc_rows = [r for r in rows if r.get('position_accuracy') in ('MATCH', 'PARTIAL', 'MISMATCH')] |
| match_rows = [r for r in acc_rows if r.get('position_accuracy') == 'MATCH'] |
|
|
| staleness_caught = sum( |
| 1 for r in cat_b |
| if r.get('critic_verdict', '') in ('STALE', 'CONTRADICTED') |
| ) |
| contradiction_caught = sum( |
| 1 for r in cat_c |
| if r.get('critic_verdict', '') == 'CONTRADICTED' |
| ) |
|
|
| print(f"\nTotal rows: {len(rows)}") |
| print(f"Verdict distribution: {verdict_counts}") |
| print(f"\nSTALE: {len(stale_rows)} | CONTRADICTED: {len(contra_rows)}") |
| print(f"\nCat B (staleness, n={len(cat_b)}): staleness_caught={staleness_caught} ({staleness_caught/max(len(cat_b),1)*100:.1f}%)") |
| print(f"Cat C (contradiction, n={len(cat_c)}): contradiction_caught={contradiction_caught} ({contradiction_caught/max(len(cat_c),1)*100:.1f}%)") |
| print(f"\nPosition accuracy (MATCH): {len(match_rows)}/{len(acc_rows)} = {len(match_rows)/max(len(acc_rows),1)*100:.1f}%") |
| print(f"\nv1 baseline comparison:") |
| print(f" Contradiction catch rate: v1=0.0% v2={contradiction_caught/max(len(cat_c),1)*100:.1f}%") |
| print(f" Position accuracy: v1=43.9% v2={len(match_rows)/max(len(acc_rows),1)*100:.1f}%") |
|
|