import sys, os, json sys.path.insert(0, '.') from dotenv import load_dotenv load_dotenv() from eval.run_eval import run_architecture, run_recon_full questions = json.load(open('eval/questions.json')) ground_truth_list = json.load(open('eval/ground_truth.json')) gt = {e['id']: e for e in ground_truth_list} cat_c = [q for q in questions if q.get('category') == 'C'] print(f"Running {len(cat_c)} Cat C questions with gap filter = 1 year") run_architecture( arch_name='recon_catc_gap1yr', decay_config='linear', runner_fn=run_recon_full, questions=cat_c, gt_map=gt, output_path='eval/results/recon_catc_gap1yr.csv', ) import csv rows = list(csv.DictReader(open('eval/results/recon_catc_gap1yr.csv', encoding='utf-8'))) verdicts = [r.get('critic_verdict', '') for r in rows] contra = [r for r in rows if r.get('critic_verdict') == 'CONTRADICTED'] print(f"\nCat C results ({len(rows)} questions, gap filter = 1yr):") print(f"Verdict distribution: { {v: verdicts.count(v) for v in sorted(set(verdicts))} }") print(f"CONTRADICTED: {len(contra)}/{len(rows)} = {len(contra)/max(len(rows),1)*100:.1f}%") if contra: print("CONTRADICTED hits:") for r in contra: print(f" [{r.get('question_id','?')}] {r.get('question','')[:70]}")