recon / eval /test_catc.py
MukulRay's picture
Phase 1.3: eval results, test scripts, gap filter reverted — no improvement, changelog update
10f9a75
import sys, os, json
sys.path.insert(0, '.')
from dotenv import load_dotenv
load_dotenv()
from eval.run_eval import run_architecture, run_recon_full
# Load the evaluation inputs. Context managers close the files promptly, and
# the explicit UTF-8 encoding keeps JSON parsing independent of the platform's
# default locale encoding.
with open('eval/questions.json', encoding='utf-8') as questions_file:
    questions = json.load(questions_file)
with open('eval/ground_truth.json', encoding='utf-8') as ground_truth_file:
    ground_truth_list = json.load(ground_truth_file)

# Index the ground-truth entries by their 'id' field; passed to the runner as gt_map.
gt = {e['id']: e for e in ground_truth_list}

# Restrict the run to questions whose 'category' field is 'C'.
cat_c = [q for q in questions if q.get('category') == 'C']
print(f"Running {len(cat_c)} Cat C questions with gap filter = 1 year")

# Run the Cat C subset through run_recon_full.
# NOTE(review): presumably run_architecture writes one row per question to
# output_path (the script reads that CSV back afterwards) -- confirm in
# eval/run_eval.py.
run_architecture(
    arch_name='recon_catc_gap1yr',
    decay_config='linear',
    runner_fn=run_recon_full,
    questions=cat_c,
    gt_map=gt,
    output_path='eval/results/recon_catc_gap1yr.csv',
)
import csv
from collections import Counter

# Read the results CSV back in. The csv module asks for newline='' so quoted
# fields containing line breaks are parsed correctly; the context manager
# closes the file once all rows have been materialized.
with open('eval/results/recon_catc_gap1yr.csv', encoding='utf-8', newline='') as results_file:
    rows = list(csv.DictReader(results_file))

# DictReader fills missing cells of short rows with None, so `.get(key, '')`
# can still yield None; normalize to '' so the verdicts stay sortable.
verdicts = [r.get('critic_verdict') or '' for r in rows]
contra = [r for r in rows if r.get('critic_verdict') == 'CONTRADICTED']

# Tally each verdict in a single pass and present the tally in sorted key order.
verdict_counts = Counter(verdicts)
distribution = {v: verdict_counts[v] for v in sorted(verdict_counts)}

print(f"\nCat C results ({len(rows)} questions, gap filter = 1yr):")
print(f"Verdict distribution: {distribution}")
# max(..., 1) avoids a ZeroDivisionError when the CSV has no data rows.
print(f"CONTRADICTED: {len(contra)}/{len(rows)} = {len(contra)/max(len(rows),1)*100:.1f}%")
if contra:
    print("CONTRADICTED hits:")
    for r in contra:
        # Show at most the first 70 characters of the question text.
        print(f" [{r.get('question_id','?')}] {(r.get('question') or '')[:70]}")