import csv
import os
import sys
from collections import Counter
# Default location of the results CSV and the number of questions expected
# in a complete run.
RESULTS_PATH = 'eval/results/recon_linear_v2.csv'
TOTAL_QUESTIONS = 130

# How many CONTRADICTED rows to preview, and how many characters of each
# question to show.
PREVIEW_ROWS = 5
PREVIEW_CHARS = 70


def _text(row, key, default=''):
    """Return ``row[key]`` as text, or *default* when it is missing or None.

    ``csv.DictReader`` fills the absent trailing fields of a short row with
    ``None`` (its default ``restval``), so the key is present and
    ``row.get(key, '')`` would still hand back ``None``.  A partially
    written last line produces exactly such a row.
    """
    value = row.get(key)
    return default if value is None else value


def build_report(rows, total=TOTAL_QUESTIONS):
    """Build the progress report for *rows* as a list of output lines.

    Args:
        rows: Non-empty sequence of dict rows as produced by
            ``csv.DictReader``.  The columns read are ``critic_verdict``,
            ``position_accuracy``, ``question_id`` and ``question``; any of
            them may be absent or ``None``.
        total: Number of questions shown as the denominator of the
            progress line.

    Returns:
        The report lines, in print order, without trailing newlines.

    Raises:
        ValueError: If *rows* is empty (percentages would be undefined).
    """
    done = len(rows)
    if not done:
        raise ValueError('build_report() requires at least one row')

    # One pass over the rows; a missing/None verdict is counted under ''.
    verdict_counts = Counter(_text(row, 'critic_verdict') for row in rows)
    contradicted = [
        row for row in rows if row.get('critic_verdict') == 'CONTRADICTED'
    ]
    stale = verdict_counts['STALE']
    # position_accuracy is matched case-insensitively, ignoring padding.
    matches = sum(
        _text(row, 'position_accuracy').strip().upper() == 'MATCH'
        for row in rows
    )

    distribution = dict(sorted(verdict_counts.items()))
    lines = [
        f"Progress: {done} / {total} questions done",
        f"Verdict distribution: {distribution}",
        f"CONTRADICTED: {len(contradicted)} "
        f"({len(contradicted) / done * 100:.1f}%)",
        f"STALE: {stale} ({stale / done * 100:.1f}%)",
        f"Position accuracy: {matches}/{done} = {matches / done * 100:.1f}%",
    ]

    if contradicted:
        # The empty element reproduces the blank line before the heading.
        lines.append("")
        lines.append("First CONTRADICTED hits:")
        for row in contradicted[:PREVIEW_ROWS]:
            question_id = _text(row, 'question_id', '?')
            question = _text(row, 'question')[:PREVIEW_CHARS]
            lines.append(f" [{question_id}] {question}")
    return lines


def main(path=RESULTS_PATH, total=TOTAL_QUESTIONS):
    """Print a progress report for the results CSV at *path*.

    Prints a short notice instead of a report when the file does not exist
    or contains no data rows.

    Args:
        path: Location of the results CSV.
        total: Number of questions shown as the progress denominator.

    Returns:
        0 in every handled case, for use as the process exit status.
    """
    if not os.path.exists(path):
        print('No results file yet — eval may still be starting up')
        return 0

    with open(path, newline='', encoding='utf-8') as f:
        rows = list(csv.DictReader(f))

    if not rows:
        print('File exists but is empty — eval just started')
        return 0

    print('\n'.join(build_report(rows, total)))
    return 0


if __name__ == '__main__':
    sys.exit(main())