Phase 1.3: smoke test scripts, eval runner, check_progress, fix unicode in run_eval.py
Browse files
- eval/check_progress.py +30 -0
- eval/run_eval.py +11 -11
- eval/run_recon_linear.py +62 -0
- eval/smoke_test.py +37 -0
eval/check_progress.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Print a progress snapshot for the recon_linear_v2 evaluation run.

Reads the results CSV at ``path`` and reports how many rows exist so far,
the distribution of ``critic_verdict`` values, the share of CONTRADICTED
and STALE verdicts, and the share of rows whose ``position_accuracy`` is
MATCH.  The results file is only read, never modified.
"""
import csv
import os
import sys
from collections import Counter

path = 'eval/results/recon_linear_v2.csv'

# Denominator shown in the "Progress" line of the report.
TOTAL_QUESTIONS = 130


def _text(row: dict, key: str, default: str = '') -> str:
    """Return ``row[key]`` as a string, mapping missing/None to *default*.

    csv.DictReader fills the columns of a row that has fewer cells than the
    header with None (for example a row read while it is still being
    written), so a plain ``row.get(key, default)`` can still return None.
    """
    value = row.get(key)
    return default if value is None else value


def _pct(count: int, total: int) -> float:
    """Return *count* as a percentage of *total*; 0.0 when *total* is 0."""
    return count / total * 100 if total else 0.0


def build_report(rows: list[dict], total: int = TOTAL_QUESTIONS) -> list[str]:
    """Build the report lines for the result rows read so far.

    Args:
        rows: Rows as produced by ``csv.DictReader`` over the results file.
        total: Number shown as the denominator of the progress line.

    Returns:
        The lines to print, in order.  The list of CONTRADICTED examples
        (at most five) is appended only when at least one such row exists.
    """
    done = len(rows)
    verdicts = Counter(_text(r, 'critic_verdict') for r in rows)
    # Sorted keys keep the printed distribution stable between invocations.
    distribution = {v: verdicts[v] for v in sorted(verdicts)}
    contra = [r for r in rows if r.get('critic_verdict') == 'CONTRADICTED']
    stale = verdicts['STALE']
    correct = sum(
        1 for r in rows
        if _text(r, 'position_accuracy').strip().upper() == 'MATCH'
    )

    # NOTE(review): accuracy is measured against every row read so far,
    # including rows that carry no MATCH/PARTIAL/MISMATCH score.
    lines = [
        f"Progress: {done} / {total} questions done",
        f"Verdict distribution: {distribution}",
        f"CONTRADICTED: {len(contra)} ({_pct(len(contra), done):.1f}%)",
        f"STALE: {stale} ({_pct(stale, done):.1f}%)",
        f"Position accuracy: {correct}/{done} = {_pct(correct, done):.1f}%",
    ]
    if contra:
        lines.append("\nFirst CONTRADICTED hits:")
        lines.extend(
            f" [{_text(r, 'question_id', '?')}] {_text(r, 'question')[:70]}"
            for r in contra[:5]
        )
    return lines


def main() -> None:
    """Read the results file and print the progress report.

    Exits with status 0 (after printing a short notice) when the results
    file does not exist yet or contains no data rows.
    """
    if not os.path.exists(path):
        print('No results file yet — eval may still be starting up')
        sys.exit(0)

    with open(path, newline='', encoding='utf-8') as f:
        rows = list(csv.DictReader(f))

    if not rows:
        print('File exists but is empty — eval just started')
        sys.exit(0)

    for line in build_report(rows):
        print(line)


if __name__ == '__main__':
    main()
|
eval/run_eval.py
CHANGED
|
@@ -130,10 +130,10 @@ def _groq_call_with_backoff(llm: ChatGroq, messages: list) -> str:
|
|
| 130 |
if retry_after > _MAX_WAIT_SECONDS:
|
| 131 |
# Daily token limit — no point waiting
|
| 132 |
print(
|
| 133 |
-
f"\n
|
| 134 |
f"(retry-after={retry_after:.0f}s > {_MAX_WAIT_SECONDS}s cap).\n"
|
| 135 |
f" Results saved so far are intact.\n"
|
| 136 |
-
f" Re-run tomorrow
|
| 137 |
f" Exiting cleanly now."
|
| 138 |
)
|
| 139 |
raise SystemExit(0)
|
|
@@ -141,7 +141,7 @@ def _groq_call_with_backoff(llm: ChatGroq, messages: list) -> str:
|
|
| 141 |
# TPM limit — wait and retry
|
| 142 |
actual_wait = min(retry_after + 2, _MAX_WAIT_SECONDS)
|
| 143 |
print(
|
| 144 |
-
f"\n
|
| 145 |
f"Waiting {actual_wait:.0f}s before retry..."
|
| 146 |
)
|
| 147 |
time.sleep(actual_wait)
|
|
@@ -445,7 +445,7 @@ def run_architecture(
|
|
| 445 |
print(f"{'='*60}")
|
| 446 |
|
| 447 |
if not remaining:
|
| 448 |
-
print("
|
| 449 |
return
|
| 450 |
|
| 451 |
f, writer = get_csv_writer(output_path)
|
|
@@ -520,9 +520,9 @@ def run_architecture(
|
|
| 520 |
raise
|
| 521 |
row["position_accuracy"] = score
|
| 522 |
row["judge_reason"] = reason[:200]
|
| 523 |
-
print(f"
|
| 524 |
else:
|
| 525 |
-
print(f"
|
| 526 |
|
| 527 |
# ── Staleness catch rate — Category B ────────────────────────
|
| 528 |
if category == "B":
|
|
@@ -538,7 +538,7 @@ def run_architecture(
|
|
| 538 |
finally:
|
| 539 |
f.close()
|
| 540 |
|
| 541 |
-
print(f"\n
|
| 542 |
|
| 543 |
|
| 544 |
# ── Summary aggregation ───────────────────────────────────────────────────────
|
|
@@ -622,7 +622,7 @@ def compute_summary(results_dir: str) -> None:
|
|
| 622 |
writer = csv.DictWriter(f, fieldnames=list(summary_rows[0].keys()))
|
| 623 |
writer.writeheader()
|
| 624 |
writer.writerows(summary_rows)
|
| 625 |
-
print(f"\n
|
| 626 |
print_summary_table(summary_rows)
|
| 627 |
else:
|
| 628 |
print("\n⚠ No completed result files found to summarise.")
|
|
@@ -643,8 +643,8 @@ def print_summary_table(rows: list[dict]) -> None:
|
|
| 643 |
f" {r['retry_rate']*100:>6.1f}%"
|
| 644 |
)
|
| 645 |
print("="*90)
|
| 646 |
-
print("
|
| 647 |
-
print("
|
| 648 |
|
| 649 |
|
| 650 |
# ── Entry point ───────────────────────────────────────────────────────────────
|
|
@@ -725,7 +725,7 @@ def main():
|
|
| 725 |
# ── Compute and print summary ─────────────────────────────────────────────
|
| 726 |
compute_summary(RESULTS_DIR)
|
| 727 |
|
| 728 |
-
print("\n
|
| 729 |
print("Next steps:")
|
| 730 |
print(" 1. Review eval/results/summary.csv for headline metrics")
|
| 731 |
print(" 2. Run eval/calibration.py to generate calibration curve PNG")
|
|
|
|
| 130 |
if retry_after > _MAX_WAIT_SECONDS:
|
| 131 |
# Daily token limit — no point waiting
|
| 132 |
print(
|
| 133 |
+
f"\n[STOP] Groq daily token limit reached "
|
| 134 |
f"(retry-after={retry_after:.0f}s > {_MAX_WAIT_SECONDS}s cap).\n"
|
| 135 |
f" Results saved so far are intact.\n"
|
| 136 |
+
f" Re-run tomorrow -- the harness will resume from where it stopped.\n"
|
| 137 |
f" Exiting cleanly now."
|
| 138 |
)
|
| 139 |
raise SystemExit(0)
|
|
|
|
| 141 |
# TPM limit — wait and retry
|
| 142 |
actual_wait = min(retry_after + 2, _MAX_WAIT_SECONDS)
|
| 143 |
print(
|
| 144 |
+
f"\n[WAIT] Groq rate limit (attempt {attempt+1}/{_MAX_RETRIES}). "
|
| 145 |
f"Waiting {actual_wait:.0f}s before retry..."
|
| 146 |
)
|
| 147 |
time.sleep(actual_wait)
|
|
|
|
| 445 |
print(f"{'='*60}")
|
| 446 |
|
| 447 |
if not remaining:
|
| 448 |
+
print(" [done] Already complete, skipping.")
|
| 449 |
return
|
| 450 |
|
| 451 |
f, writer = get_csv_writer(output_path)
|
|
|
|
| 520 |
raise
|
| 521 |
row["position_accuracy"] = score
|
| 522 |
row["judge_reason"] = reason[:200]
|
| 523 |
+
print(f" >> verdict={row['critic_verdict']} judge={score} {reason[:60]}")
|
| 524 |
else:
|
| 525 |
+
print(f" >> verdict={row['critic_verdict']} judge=SKIPPED (no position or GT)")
|
| 526 |
|
| 527 |
# ── Staleness catch rate — Category B ────────────────────────
|
| 528 |
if category == "B":
|
|
|
|
| 538 |
finally:
|
| 539 |
f.close()
|
| 540 |
|
| 541 |
+
print(f"\n [done] {arch_name} complete -> {output_path}")
|
| 542 |
|
| 543 |
|
| 544 |
# ── Summary aggregation ───────────────────────────────────────────────────────
|
|
|
|
| 622 |
writer = csv.DictWriter(f, fieldnames=list(summary_rows[0].keys()))
|
| 623 |
writer.writeheader()
|
| 624 |
writer.writerows(summary_rows)
|
| 625 |
+
print(f"\n[OK] Summary written -> {summary_path}")
|
| 626 |
print_summary_table(summary_rows)
|
| 627 |
else:
|
| 628 |
print("\n⚠ No completed result files found to summarise.")
|
|
|
|
| 643 |
f" {r['retry_rate']*100:>6.1f}%"
|
| 644 |
)
|
| 645 |
print("="*90)
|
| 646 |
+
print(">> staleness_catch_rate and contradiction_catch_rate are your headline resume metrics.")
|
| 647 |
+
print(">> Copy these numbers into resume bullets after verifying they make sense.")
|
| 648 |
|
| 649 |
|
| 650 |
# ── Entry point ───────────────────────────────────────────────────────────────
|
|
|
|
| 725 |
# ── Compute and print summary ─────────────────────────────────────────────
|
| 726 |
compute_summary(RESULTS_DIR)
|
| 727 |
|
| 728 |
+
print("\n[OK] Evaluation complete.")
|
| 729 |
print("Next steps:")
|
| 730 |
print(" 1. Review eval/results/summary.csv for headline metrics")
|
| 731 |
print(" 2. Run eval/calibration.py to generate calibration curve PNG")
|
eval/run_recon_linear.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Run the recon_linear v2 architecture over the question set and summarise.

Calls ``run_architecture`` from ``eval.run_eval`` with the ``linear`` decay
config, writing results to ``OUTPUT``, then re-reads that CSV and prints
verdict counts, Category B / Category C catch rates and MATCH accuracy next
to the hard-coded v1 baseline figures.
"""
import csv
import json
import os
import sys
from collections import Counter

OUTPUT = 'eval/results/recon_linear_v2.csv'


def load_json(path: str):
    """Parse the JSON file at *path*, closing the file handle afterwards."""
    with open(path, encoding='utf-8') as f:
        return json.load(f)


def _pct(count: int, total: int) -> float:
    """Return *count* as a percentage of *total*, treating 0 total as 1."""
    return count / max(total, 1) * 100


def summary_lines(rows: list[dict]) -> list[str]:
    """Build the summary report for the result rows of a finished run.

    Args:
        rows: Rows as produced by ``csv.DictReader`` over the results file.

    Returns:
        The lines to print, in order.  Percentages are 0.0 when the
        corresponding group of rows is empty.
    """
    tally = Counter(r.get('critic_verdict') or '' for r in rows)
    # Sorted keys make the printed distribution deterministic.
    verdict_counts = {v: tally[v] for v in sorted(tally)}

    cat_b = [r for r in rows if r.get('category') == 'B']
    cat_c = [r for r in rows if r.get('category') == 'C']

    # Only rows carrying one of the three scores count towards accuracy.
    acc_rows = [
        r for r in rows
        if r.get('position_accuracy') in ('MATCH', 'PARTIAL', 'MISMATCH')
    ]
    matches = sum(1 for r in acc_rows if r.get('position_accuracy') == 'MATCH')

    # For Category B either a STALE or a CONTRADICTED verdict counts as caught.
    staleness_caught = sum(
        1 for r in cat_b
        if r.get('critic_verdict', '') in ('STALE', 'CONTRADICTED')
    )
    contradiction_caught = sum(
        1 for r in cat_c
        if r.get('critic_verdict', '') == 'CONTRADICTED'
    )

    b_rate = _pct(staleness_caught, len(cat_b))
    c_rate = _pct(contradiction_caught, len(cat_c))
    accuracy = _pct(matches, len(acc_rows))

    return [
        f"\nTotal rows: {len(rows)}",
        f"Verdict distribution: {verdict_counts}",
        f"\nSTALE: {tally['STALE']} | CONTRADICTED: {tally['CONTRADICTED']}",
        f"\nCat B (staleness, n={len(cat_b)}): staleness_caught={staleness_caught} ({b_rate:.1f}%)",
        f"Cat C (contradiction, n={len(cat_c)}): contradiction_caught={contradiction_caught} ({c_rate:.1f}%)",
        f"\nPosition accuracy (MATCH): {matches}/{len(acc_rows)} = {accuracy:.1f}%",
        "\nv1 baseline comparison:",
        f" Contradiction catch rate: v1=0.0% v2={c_rate:.1f}%",
        f" Position accuracy: v1=43.9% v2={accuracy:.1f}%",
    ]


def main() -> None:
    """Run the evaluation, then print the summary of the results CSV."""
    # The repository root must be importable and the .env file loaded before
    # eval.run_eval is imported; this keeps the original statement order.
    sys.path.insert(0, '.')
    from dotenv import load_dotenv
    load_dotenv()

    from eval.run_eval import run_architecture, run_recon_full

    questions = load_json('eval/questions.json')
    gt = {e['id']: e for e in load_json('eval/ground_truth.json')}

    print(f"Running recon_linear v2 -- {len(questions)} questions")
    print(f"Output: {OUTPUT}")
    print("Crash-resume: enabled (will skip already-done question IDs)")
    print("=" * 60)

    run_architecture(
        arch_name='recon_linear_v2',
        decay_config='linear',
        runner_fn=run_recon_full,
        questions=questions,
        gt_map=gt,
        output_path=OUTPUT,
    )

    print("\nRun complete. Computing summary...")
    # Read with the same settings eval/check_progress.py uses for this file.
    with open(OUTPUT, newline='', encoding='utf-8') as f:
        rows = list(csv.DictReader(f))

    for line in summary_lines(rows):
        print(line)


if __name__ == '__main__':
    main()
|
eval/smoke_test.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Smoke-test the recon_linear architecture on the first five questions.

Calls ``run_architecture`` from ``eval.run_eval`` on a five-question slice,
writing to ``OUTPUT``, then prints one table line per result row followed by
the verdict counts.
"""
import csv
import json
import os
import sys
from collections import Counter

OUTPUT = 'eval/results/recon_linear_smoke.csv'

# Number of leading questions used for the smoke run.
SMOKE_SIZE = 5


def load_json(path: str):
    """Parse the JSON file at *path*, closing the file handle afterwards."""
    with open(path, encoding='utf-8') as f:
        return json.load(f)


def _text(row: dict, key: str, default: str) -> str:
    """Return ``row[key]`` as a string, mapping missing/None to *default*.

    csv.DictReader fills the columns of a short row with None, and None
    rejects format specs such as ``:<15``, so it is replaced up front.
    """
    value = row.get(key)
    return default if value is None else value


def format_row(row: dict) -> str:
    """Format one result row as a fixed-width table line.

    The question text is cut to its first 60 characters; absent id,
    verdict and accuracy cells are shown as ``?``.
    """
    return (
        f"{_text(row, 'question_id', '?'):<8} "
        f"{_text(row, 'critic_verdict', '?'):<15} "
        f"{_text(row, 'position_accuracy', '?'):<10} "
        f"{_text(row, 'question', '')[:60]}"
    )


def report_lines(rows: list[dict]) -> list[str]:
    """Return the table header, one line per row, and the verdict counts."""
    tally = Counter(_text(r, 'critic_verdict', '') for r in rows)
    # Sorted keys make the printed counts deterministic.
    counts = {v: tally[v] for v in sorted(tally)}
    lines = [
        f"\n{'ID':<8} {'VERDICT':<15} {'ACCURACY':<10} QUESTION[:60]",
        "-" * 80,
    ]
    lines.extend(format_row(r) for r in rows)
    lines.append(f"\nVerdict counts: {counts}")
    return lines


def main() -> None:
    """Run the smoke evaluation, then print the table of its results."""
    # The repository root must be importable and the .env file loaded before
    # eval.run_eval is imported; this keeps the original statement order.
    sys.path.insert(0, '.')
    from dotenv import load_dotenv
    load_dotenv()

    from eval.run_eval import run_architecture, run_recon_full

    questions = load_json('eval/questions.json')
    gt = {e['id']: e for e in load_json('eval/ground_truth.json')}

    smoke_qs = questions[:SMOKE_SIZE]

    print(f"Running smoke test: {len(smoke_qs)} questions, recon_linear architecture")
    print("=" * 60)

    # NOTE(review): run_architecture appears to skip questions already present
    # in the output file, so a leftover OUTPUT from an earlier run may make
    # this a no-op -- confirm, and delete the file first if a fresh run is
    # wanted.
    run_architecture(
        arch_name='recon_linear_smoke',
        decay_config='linear',
        runner_fn=run_recon_full,
        questions=smoke_qs,
        gt_map=gt,
        output_path=OUTPUT,
    )

    print("Smoke test complete. Reading results...")
    with open(OUTPUT, newline='', encoding='utf-8') as f:
        rows = list(csv.DictReader(f))

    for line in report_lines(rows):
        print(line)


if __name__ == '__main__':
    main()
|