File size: 2,386 Bytes
f95672f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import sys, os, json
sys.path.insert(0, '.')
from dotenv import load_dotenv
load_dotenv()

from eval.run_eval import run_architecture, run_recon_full

questions = json.load(open('eval/questions.json'))
ground_truth_list = json.load(open('eval/ground_truth.json'))
gt = {e['id']: e for e in ground_truth_list}

OUTPUT = 'eval/results/recon_linear_v2.csv'

print(f"Running recon_linear v2 -- {len(questions)} questions")
print(f"Output: {OUTPUT}")
print(f"Crash-resume: enabled (will skip already-done question IDs)")
print("=" * 60)

run_architecture(
    arch_name='recon_linear_v2',
    decay_config='linear',
    runner_fn=run_recon_full,
    questions=questions,
    gt_map=gt,
    output_path=OUTPUT,
)

print("\nRun complete. Computing summary...")
import csv
with open(OUTPUT) as f:
    rows = list(csv.DictReader(f))

verdicts = [r.get('critic_verdict', '') for r in rows]
verdict_counts = {v: verdicts.count(v) for v in set(verdicts)}

cat_b = [r for r in rows if r.get('category') == 'B']
cat_c = [r for r in rows if r.get('category') == 'C']

stale_rows = [r for r in rows if r.get('critic_verdict', '') == 'STALE']
contra_rows = [r for r in rows if r.get('critic_verdict', '') == 'CONTRADICTED']

acc_rows = [r for r in rows if r.get('position_accuracy') in ('MATCH', 'PARTIAL', 'MISMATCH')]
match_rows = [r for r in acc_rows if r.get('position_accuracy') == 'MATCH']

staleness_caught = sum(
    1 for r in cat_b
    if r.get('critic_verdict', '') in ('STALE', 'CONTRADICTED')
)
contradiction_caught = sum(
    1 for r in cat_c
    if r.get('critic_verdict', '') == 'CONTRADICTED'
)

print(f"\nTotal rows: {len(rows)}")
print(f"Verdict distribution: {verdict_counts}")
print(f"\nSTALE: {len(stale_rows)} | CONTRADICTED: {len(contra_rows)}")
print(f"\nCat B (staleness, n={len(cat_b)}): staleness_caught={staleness_caught} ({staleness_caught/max(len(cat_b),1)*100:.1f}%)")
print(f"Cat C (contradiction, n={len(cat_c)}): contradiction_caught={contradiction_caught} ({contradiction_caught/max(len(cat_c),1)*100:.1f}%)")
print(f"\nPosition accuracy (MATCH): {len(match_rows)}/{len(acc_rows)} = {len(match_rows)/max(len(acc_rows),1)*100:.1f}%")
print(f"\nv1 baseline comparison:")
print(f"  Contradiction catch rate: v1=0.0%  v2={contradiction_caught/max(len(cat_c),1)*100:.1f}%")
print(f"  Position accuracy:        v1=43.9% v2={len(match_rows)/max(len(acc_rows),1)*100:.1f}%")