Spaces:

MukulRay
/

recon

Sleeping

App Files Files Community

recon / eval /run_recon_linear.py

MukulRay

Phase 1.3: smoke test scripts, eval runner, check_progress, fix unicode in run_eval.py

f95672f about 1 month ago

raw

history blame contribute delete

2.39 kB

	import sys, os, json
	sys.path.insert(0, '.')
	from dotenv import load_dotenv
	load_dotenv()

	from eval.run_eval import run_architecture, run_recon_full

	questions = json.load(open('eval/questions.json'))
	ground_truth_list = json.load(open('eval/ground_truth.json'))
	gt = {e['id']: e for e in ground_truth_list}

	OUTPUT = 'eval/results/recon_linear_v2.csv'

	print(f"Running recon_linear v2 -- {len(questions)} questions")
	print(f"Output: {OUTPUT}")
	print(f"Crash-resume: enabled (will skip already-done question IDs)")
	print("=" * 60)

	run_architecture(
	arch_name='recon_linear_v2',
	decay_config='linear',
	runner_fn=run_recon_full,
	questions=questions,
	gt_map=gt,
	output_path=OUTPUT,
	)

	print("\nRun complete. Computing summary...")
	import csv
	with open(OUTPUT) as f:
	rows = list(csv.DictReader(f))

	verdicts = [r.get('critic_verdict', '') for r in rows]
	verdict_counts = {v: verdicts.count(v) for v in set(verdicts)}

	cat_b = [r for r in rows if r.get('category') == 'B']
	cat_c = [r for r in rows if r.get('category') == 'C']

	stale_rows = [r for r in rows if r.get('critic_verdict', '') == 'STALE']
	contra_rows = [r for r in rows if r.get('critic_verdict', '') == 'CONTRADICTED']

	acc_rows = [r for r in rows if r.get('position_accuracy') in ('MATCH', 'PARTIAL', 'MISMATCH')]
	match_rows = [r for r in acc_rows if r.get('position_accuracy') == 'MATCH']

	staleness_caught = sum(
	1 for r in cat_b
	if r.get('critic_verdict', '') in ('STALE', 'CONTRADICTED')
	)
	contradiction_caught = sum(
	1 for r in cat_c
	if r.get('critic_verdict', '') == 'CONTRADICTED'
	)

	print(f"\nTotal rows: {len(rows)}")
	print(f"Verdict distribution: {verdict_counts}")
	print(f"\nSTALE: {len(stale_rows)} \| CONTRADICTED: {len(contra_rows)}")
	print(f"\nCat B (staleness, n={len(cat_b)}): staleness_caught={staleness_caught} ({staleness_caught/max(len(cat_b),1)*100:.1f}%)")
	print(f"Cat C (contradiction, n={len(cat_c)}): contradiction_caught={contradiction_caught} ({contradiction_caught/max(len(cat_c),1)*100:.1f}%)")
	print(f"\nPosition accuracy (MATCH): {len(match_rows)}/{len(acc_rows)} = {len(match_rows)/max(len(acc_rows),1)*100:.1f}%")
	print(f"\nv1 baseline comparison:")
	print(f" Contradiction catch rate: v1=0.0% v2={contradiction_caught/max(len(cat_c),1)*100:.1f}%")
	print(f" Position accuracy: v1=43.9% v2={len(match_rows)/max(len(acc_rows),1)*100:.1f}%")