Phase 2: add v2 full eval runner script
Browse files- eval/run_v2_eval.py +41 -0
eval/run_v2_eval.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys, os, json
|
| 2 |
+
sys.path.insert(0, '.')
|
| 3 |
+
from dotenv import load_dotenv
|
| 4 |
+
load_dotenv()
|
| 5 |
+
from eval.run_eval import run_architecture, run_recon_full
|
| 6 |
+
|
| 7 |
+
# Load the evaluation inputs. Context managers close each handle as soon as
# parsing finishes, and the explicit UTF-8 encoding keeps decoding independent
# of the platform's default locale encoding.
with open('eval/questions.json', encoding='utf-8') as questions_file:
    questions = json.load(questions_file)

with open('eval/ground_truth.json', encoding='utf-8') as ground_truth_file:
    # Index the ground-truth entries by their 'id' field.
    gt = {entry['id']: entry for entry in json.load(ground_truth_file)}
|
| 9 |
+
|
| 10 |
+
# CSV path handed to run_architecture as output_path.
OUTPUT = 'eval/results/recon_linear_v2_full.csv'

# Print the run banner: question count, output path, change notes, separator.
banner_lines = (
    f"Running v2 recon_linear -- {len(questions)} questions",
    f"Output: {OUTPUT}",
    "Key changes vs v1: edge reliability scorer, OpenAlex augmentation, trust summary",
    "=" * 60,
)
for banner_line in banner_lines:
    print(banner_line)

# Arguments for the shared eval driver imported from eval.run_eval.
run_kwargs = {
    'arch_name': 'recon_linear_v2_full',
    'decay_config': 'linear',
    'runner_fn': run_recon_full,
    'questions': questions,
    'gt_map': gt,
    'output_path': OUTPUT,
}
run_architecture(**run_kwargs)
|
| 24 |
+
|
| 25 |
+
# --- Post-run summary: read the results CSV back and print headline metrics ---
import csv
from collections import Counter

# newline='' is the csv module's documented way to open files; the context
# manager closes the handle once all rows have been materialized.
with open(OUTPUT, encoding='utf-8', newline='') as results_file:
    rows = list(csv.DictReader(results_file))

# Denominator for the overall figures: the number of questions submitted to
# the run, instead of a hard-coded count that drifts when the set changes.
total = len(questions)

# DictReader fills missing trailing columns with None, so coerce to '' before
# sorting verdicts or calling string methods on a cell.
verdicts = [r.get('critic_verdict') or '' for r in rows]
verdict_counts = Counter(verdicts)
match_rows = [r for r in rows if (r.get('position_accuracy') or '').upper() == 'MATCH']
stale_rows = [r for r in rows if r.get('critic_verdict') == 'STALE']
contra_rows = [r for r in rows if r.get('critic_verdict') == 'CONTRADICTED']
cat_b = [r for r in rows if r.get('category') == 'B']
cat_b_stale = [r for r in cat_b if r.get('staleness_caught', '') == '1']

# Compute each percentage once; max(..., 1) avoids ZeroDivisionError when a
# denominator is empty.
stale_rate = len(cat_b_stale) / max(len(cat_b), 1) * 100
match_rate = len(match_rows) / max(total, 1) * 100

print(f"\n=== v2 FINAL RESULTS ({len(rows)}/{total}) ===")
print(f"Verdict distribution: {dict(sorted(verdict_counts.items()))}")
print(f"STALE: {len(stale_rows)}/{total} | CONTRADICTED: {len(contra_rows)}/{total}")
print(f"Staleness catch rate (Cat B): {len(cat_b_stale)}/{len(cat_b)} = {stale_rate:.1f}%")
print(f"Position accuracy (MATCH): {len(match_rows)}/{total} = {match_rate:.1f}%")
print("\n--- vs baselines ---")
# The v1 figures below are fixed reference numbers carried over from the script.
print(f"v1 staleness catch rate: 52.0% | v2: {stale_rate:.1f}%")
print(f"v1 position accuracy: 43.9% | v2: {match_rate:.1f}%")
|