Spaces:

MukulRay
/

recon

Sleeping

recon / eval /test_catc.py

Phase 1.3: eval results, test scripts, gap filter reverted — no improvement, changelog update

10f9a75 27 days ago

1.26 kB

	import sys, os, json
	sys.path.insert(0, '.')
	from dotenv import load_dotenv
	load_dotenv()
	from eval.run_eval import run_architecture, run_recon_full

	# Load the evaluation questions and the ground-truth records. The with-blocks
	# close each handle as soon as parsing finishes, and the explicit UTF-8
	# encoding keeps decoding independent of the platform's locale default
	# (the results CSV later in this script is read as UTF-8 as well).
	with open('eval/questions.json', encoding='utf-8') as questions_file:
	    questions = json.load(questions_file)
	with open('eval/ground_truth.json', encoding='utf-8') as ground_truth_file:
	    ground_truth_list = json.load(ground_truth_file)

	# Index the ground-truth entries by their 'id' field; this map is handed to
	# the runner as gt_map.
	gt = {e['id']: e for e in ground_truth_list}

	# Keep only the questions whose 'category' field is 'C'.
	cat_c = [q for q in questions if q.get('category') == 'C']
	print(f"Running {len(cat_c)} Cat C questions with gap filter = 1 year")

	# NOTE(review): the 1-year gap filter named in the messages is not configured
	# anywhere in this script; presumably it is set inside the runner -- confirm.
	# The results CSV is presumably written by run_architecture to output_path.
	run_architecture(
	    arch_name='recon_catc_gap1yr',
	    decay_config='linear',
	    runner_fn=run_recon_full,
	    questions=cat_c,
	    gt_map=gt,
	    output_path='eval/results/recon_catc_gap1yr.csv',
	)

	import csv
	from collections import Counter

	# Read back the per-question rows from the CSV path that was given to
	# run_architecture as output_path. newline='' is what the csv module
	# documents for files handed to its readers, and the with-block closes the
	# handle once every row has been materialised.
	with open('eval/results/recon_catc_gap1yr.csv', encoding='utf-8', newline='') as results_file:
	    rows = list(csv.DictReader(results_file))

	# DictReader fills cells missing from a short row with None; fold those into
	# '' so the verdict labels remain sortable strings.
	verdicts = [r.get('critic_verdict') or '' for r in rows]
	# Tally every verdict in a single pass instead of one list.count per label.
	verdict_counts = Counter(verdicts)
	contra = [r for r in rows if r.get('critic_verdict') == 'CONTRADICTED']

	print(f"\nCat C results ({len(rows)} questions, gap filter = 1yr):")
	print(f"Verdict distribution: { {v: verdict_counts[v] for v in sorted(verdict_counts)} }")
	# max(..., 1) keeps the percentage defined when the CSV holds no rows.
	print(f"CONTRADICTED: {len(contra)}/{len(rows)} = {len(contra)/max(len(rows),1)*100:.1f}%")
	if contra:
	    print("CONTRADICTED hits:")
	    for r in contra:
	        # Show at most the first 70 characters of the question text; a
	        # missing (None) question cell is printed as an empty string.
	        print(f" [{r.get('question_id','?')}] {(r.get('question') or '')[:70]}")