File size: 1,264 Bytes
10f9a75
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
import sys, os, json
# Put the current working directory first on the module search path so the
# `eval.run_eval` import below can resolve. NOTE(review): this assumes the
# script is launched from the directory that contains `eval/` — confirm.
sys.path.insert(0, '.')
from dotenv import load_dotenv
# Called before the project import below. NOTE(review): presumably so that
# settings from a .env file are already in the environment when
# `eval.run_eval` is imported — confirm against that module.
load_dotenv()
from eval.run_eval import run_architecture, run_recon_full

# Load the evaluation inputs. Context managers close each file as soon as it
# has been parsed, and the encoding is pinned to UTF-8 (the same encoding the
# results CSV is opened with further down) rather than the platform default.
with open('eval/questions.json', encoding='utf-8') as questions_file:
    questions = json.load(questions_file)
with open('eval/ground_truth.json', encoding='utf-8') as ground_truth_file:
    ground_truth_list = json.load(ground_truth_file)

# Index the ground-truth entries by their 'id' field; this mapping is what
# gets handed to run_architecture as `gt_map`.
gt = {e['id']: e for e in ground_truth_list}

# Keep only the questions whose 'category' field is exactly 'C'. Entries
# without a 'category' key are skipped rather than raising.
cat_c = [q for q in questions if q.get('category') == 'C']
print(f"Running {len(cat_c)} Cat C questions with gap filter = 1 year")

# Keyword arguments for the Cat C run, gathered in one place so the
# configuration can be read at a glance. The CSV named by `output_path` is the
# same file the summary code after this call reads back.
_catc_run_kwargs = {
    'arch_name': 'recon_catc_gap1yr',
    'decay_config': 'linear',
    'runner_fn': run_recon_full,
    'questions': cat_c,
    'gt_map': gt,
    'output_path': 'eval/results/recon_catc_gap1yr.csv',
}
run_architecture(**_catc_run_kwargs)

import csv
from collections import Counter

# Read the per-question results back from the CSV. The file is closed as soon
# as the rows are materialised, and newline='' lets the csv module handle line
# endings itself (as its documentation recommends for file objects).
with open('eval/results/recon_catc_gap1yr.csv', encoding='utf-8', newline='') as results_file:
    rows = list(csv.DictReader(results_file))

# DictReader fills missing trailing cells with None, so normalise those to ''
# here; mixing None with strings would otherwise break the sort below.
verdicts = [r.get('critic_verdict') or '' for r in rows]
contra = [r for r in rows if r.get('critic_verdict') == 'CONTRADICTED']

# Tally every verdict in a single pass, then order by verdict label so the
# printed dict is stable from run to run.
verdict_counts = dict(sorted(Counter(verdicts).items()))

print(f"\nCat C results ({len(rows)} questions, gap filter = 1yr):")
print(f"Verdict distribution: {verdict_counts}")
# max(..., 1) guards the percentage against division by zero on an empty CSV.
print(f"CONTRADICTED: {len(contra)}/{len(rows)} = {len(contra)/max(len(rows),1)*100:.1f}%")
if contra:
    print("CONTRADICTED hits:")
    for r in contra:
        # Show at most the first 70 characters of the question text.
        print(f"  [{r.get('question_id','?')}] {(r.get('question') or '')[:70]}")