MukulRay committed on
Commit
dffe992
·
1 Parent(s): dcd26cc

Phase 2: add v2 full eval runner script

Browse files
Files changed (1) hide show
  1. eval/run_v2_eval.py +41 -0
eval/run_v2_eval.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Run the full v2 ``recon_linear`` evaluation and print a results summary.

The script loads the question set and ground truth, hands them to
``run_architecture`` (imported from ``eval.run_eval``) together with an output
path, then re-reads the CSV at that path and prints verdict, staleness and
position-accuracy figures next to the recorded v1 baselines.

It is meant to be executed from the repository root, e.g.
``python eval/run_v2_eval.py``; the relative paths below depend on that.
"""
import csv
import json
import os  # kept from the original import list; not used directly here
import sys
from collections import Counter

QUESTIONS_PATH = 'eval/questions.json'
GROUND_TRUTH_PATH = 'eval/ground_truth.json'
OUTPUT = 'eval/results/recon_linear_v2_full.csv'


def load_json(path):
    """Return the parsed JSON document stored at *path*.

    The file is opened as UTF-8 and closed before returning.
    """
    with open(path, encoding='utf-8') as handle:
        return json.load(handle)


def load_rows(path):
    """Return every row of the CSV at *path* as a list of dicts."""
    # newline='' is what the csv module asks for so that quoted fields
    # containing line breaks are parsed correctly.
    with open(path, encoding='utf-8', newline='') as handle:
        return list(csv.DictReader(handle))


def percent(part, whole):
    """Return ``part / whole`` as a percentage, or 0.0 when *whole* is 0."""
    return part / max(whole, 1) * 100


def summarize_results(rows, total):
    """Build the summary lines for a finished evaluation run.

    Args:
        rows: Result rows as produced by ``csv.DictReader``. The columns read
            are ``critic_verdict``, ``position_accuracy``, ``category`` and
            ``staleness_caught``; a missing or ``None`` value counts as ''.
        total: Number of questions the run was expected to cover. It is the
            denominator for every "out of all questions" figure.

    Returns:
        A list of strings, one per ``print`` call, in display order. Two of
        them start with a newline so a blank line precedes the section.
    """
    # DictReader yields None for columns missing from a short row; normalise
    # to '' so sorting and .upper() cannot fail on mixed None/str values.
    verdict_counts = Counter((row.get('critic_verdict') or '') for row in rows)
    distribution = dict(sorted(verdict_counts.items()))

    match_count = sum(
        1 for row in rows
        if (row.get('position_accuracy') or '').upper() == 'MATCH'
    )

    # The staleness catch rate is measured over category "B" rows only.
    cat_b = [row for row in rows if row.get('category') == 'B']
    cat_b_caught = sum(1 for row in cat_b if row.get('staleness_caught') == '1')

    stale_pct = percent(cat_b_caught, len(cat_b))
    match_pct = percent(match_count, total)

    return [
        f"\n=== v2 FINAL RESULTS ({len(rows)}/{total}) ===",
        f"Verdict distribution: {distribution}",
        f"STALE: {verdict_counts['STALE']}/{total}"
        f" | CONTRADICTED: {verdict_counts['CONTRADICTED']}/{total}",
        f"Staleness catch rate (Cat B): {cat_b_caught}/{len(cat_b)}"
        f" = {stale_pct:.1f}%",
        f"Position accuracy (MATCH): {match_count}/{total} = {match_pct:.1f}%",
        "\n--- vs baselines ---",
        f"v1 staleness catch rate: 52.0% | v2: {stale_pct:.1f}%",
        f"v1 position accuracy: 43.9% | v2: {match_pct:.1f}%",
    ]


def main():
    """Run the v2 evaluation, then print the summary of its CSV output."""
    # Make the repository root importable so the ``eval`` package resolves.
    sys.path.insert(0, '.')

    # Same order as the original script: the .env file is loaded before
    # ``eval.run_eval`` is imported.
    from dotenv import load_dotenv
    load_dotenv()
    from eval.run_eval import run_architecture, run_recon_full

    questions = load_json(QUESTIONS_PATH)
    gt = {entry['id']: entry for entry in load_json(GROUND_TRUTH_PATH)}

    print(f"Running v2 recon_linear -- {len(questions)} questions")
    print(f"Output: {OUTPUT}")
    print("Key changes vs v1: edge reliability scorer, OpenAlex augmentation, trust summary")
    print("=" * 60)

    run_architecture(
        arch_name='recon_linear_v2_full',
        decay_config='linear',
        runner_fn=run_recon_full,
        questions=questions,
        gt_map=gt,
        output_path=OUTPUT,
    )

    # The results file is expected to exist at OUTPUT once the run returns.
    for line in summarize_results(load_rows(OUTPUT), len(questions)):
        print(line)


if __name__ == '__main__':
    main()