Spaces:

MukulRay
/

recon

Sleeping

App Files Files Community

MukulRay committed on Apr 22

Commit

f95672f

1 Parent(s): b2fc5a3

Phase 1.3: smoke test scripts, eval runner, check_progress, fix unicode in run_eval.py

Browse files

Files changed (4) hide show

eval/check_progress.py +30 -0
eval/run_eval.py +11 -11
eval/run_recon_linear.py +62 -0
eval/smoke_test.py +37 -0

eval/check_progress.py ADDED Viewed

	@@ -0,0 +1,30 @@

"""Print a progress snapshot for the ``recon_linear_v2`` evaluation run.

Reads ``eval/results/recon_linear_v2.csv`` and reports how many questions have
been answered, the critic-verdict distribution, and the running position
accuracy.  When the file is missing or has no data rows yet, a short notice is
printed instead and the script exits with status 0.
"""

import csv
import os
import sys
from collections import Counter

RESULTS_PATH = 'eval/results/recon_linear_v2.csv'
# Denominator shown in the "N / 130 questions done" progress line.
TOTAL_QUESTIONS = 130
# How many CONTRADICTED rows to list, and how much of each question to show.
MAX_HITS = 5
QUESTION_PREVIEW_CHARS = 70


def _field(row: dict, key: str, default: str = '') -> str:
    """Return ``row[key]``, or *default* when the cell is absent.

    ``csv.DictReader`` fills the missing cells of a short row with ``None``
    (for example a row that is only partly written), so ``row.get(key, '')``
    alone can still hand back ``None``.
    """
    value = row.get(key)
    return default if value is None else value


def build_report(rows: list[dict]) -> list[str]:
    """Return the report lines for *rows*, in the order they are printed.

    Args:
        rows: Non-empty list of result rows as produced by ``csv.DictReader``.

    Returns:
        One string per ``print`` call; a leading ``"\\n"`` inside a line
        produces the blank separator line of the original output.

    Raises:
        ValueError: If *rows* is empty (percentages would be undefined).
    """
    if not rows:
        raise ValueError('build_report() needs at least one result row')

    total = len(rows)
    # Verdicts are stripped so they are matched as leniently as
    # position_accuracy below.
    verdicts = [_field(r, 'critic_verdict').strip() for r in rows]
    counts = Counter(verdicts)
    contradicted = [r for r, v in zip(rows, verdicts) if v == 'CONTRADICTED']
    stale = counts['STALE']
    correct = sum(
        1 for r in rows
        if _field(r, 'position_accuracy').strip().upper() == 'MATCH'
    )

    def pct(part: int) -> str:
        return f"{part / total * 100:.1f}"

    lines = [
        f"Progress: {total} / {TOTAL_QUESTIONS} questions done",
        f"Verdict distribution: {dict(sorted(counts.items()))}",
        f"CONTRADICTED: {len(contradicted)} ({pct(len(contradicted))}%)",
        f"STALE: {stale} ({pct(stale)}%)",
        # NOTE: the denominator is every row, including rows that carry no
        # MATCH/PARTIAL/MISMATCH score at all.
        f"Position accuracy: {correct}/{total} = {pct(correct)}%",
    ]
    if contradicted:
        lines.append("\nFirst CONTRADICTED hits:")
        for r in contradicted[:MAX_HITS]:
            question = _field(r, 'question')[:QUESTION_PREVIEW_CHARS]
            lines.append(f"  [{_field(r, 'question_id', '?')}] {question}")
    return lines


def main() -> int:
    """Load the results CSV and print the progress report; return exit status."""
    # Messages use ASCII "--" rather than an em dash, matching the ASCII-only
    # console output this commit introduced in run_eval.py.
    if not os.path.exists(RESULTS_PATH):
        print('No results file yet -- eval may still be starting up')
        return 0
    with open(RESULTS_PATH, newline='', encoding='utf-8') as f:
        rows = list(csv.DictReader(f))
    if not rows:
        print('File exists but is empty -- eval just started')
        return 0
    for line in build_report(rows):
        print(line)
    return 0


if __name__ == '__main__':
    sys.exit(main())

eval/run_eval.py CHANGED Viewed

@@ -130,10 +130,10 @@ def _groq_call_with_backoff(llm: ChatGroq, messages: list) -> str:
             if retry_after > _MAX_WAIT_SECONDS:
                 # Daily token limit — no point waiting
                 print(
-                    f"\n⛔ Groq daily token limit reached "
                     f"(retry-after={retry_after:.0f}s > {_MAX_WAIT_SECONDS}s cap).\n"
                     f"   Results saved so far are intact.\n"
-                    f"   Re-run tomorrow — the harness will resume from where it stopped.\n"
                     f"   Exiting cleanly now."
                 )
                 raise SystemExit(0)
@@ -141,7 +141,7 @@ def _groq_call_with_backoff(llm: ChatGroq, messages: list) -> str:
             # TPM limit — wait and retry
             actual_wait = min(retry_after + 2, _MAX_WAIT_SECONDS)
             print(
-                f"\n⏳ Groq rate limit (attempt {attempt+1}/{_MAX_RETRIES}). "
                 f"Waiting {actual_wait:.0f}s before retry..."
             )
             time.sleep(actual_wait)
@@ -445,7 +445,7 @@ def run_architecture(
     print(f"{'='*60}")
     if not remaining:
-        print("  ✓ Already complete, skipping.")
         return
     f, writer = get_csv_writer(output_path)
@@ -520,9 +520,9 @@ def run_architecture(
                     raise
                 row["position_accuracy"] = score
                 row["judge_reason"]      = reason[:200]
-                print(f"    → verdict={row['critic_verdict']}  judge={score}  {reason[:60]}")
             else:
-                print(f"    → verdict={row['critic_verdict']}  judge=SKIPPED (no position or GT)")
             # ── Staleness catch rate — Category B ────────────────────────
             if category == "B":
@@ -538,7 +538,7 @@ def run_architecture(
     finally:
         f.close()
-    print(f"\n  ✓ {arch_name} complete → {output_path}")
 # ── Summary aggregation ───────────────────────────────────────────────────────
@@ -622,7 +622,7 @@ def compute_summary(results_dir: str) -> None:
             writer = csv.DictWriter(f, fieldnames=list(summary_rows[0].keys()))
             writer.writeheader()
             writer.writerows(summary_rows)
-        print(f"\n✅ Summary written → {summary_path}")
         print_summary_table(summary_rows)
     else:
         print("\n⚠ No completed result files found to summarise.")
@@ -643,8 +643,8 @@ def print_summary_table(rows: list[dict]) -> None:
             f"  {r['retry_rate']*100:>6.1f}%"
         )
     print("="*90)
-    print("→ staleness_catch_rate and contradiction_catch_rate are your headline resume metrics.")
-    print("→ Copy these numbers into resume bullets after verifying they make sense.")
 # ── Entry point ───────────────────────────────────────────────────────────────
@@ -725,7 +725,7 @@ def main():
     # ── Compute and print summary ─────────────────────────────────────────────
     compute_summary(RESULTS_DIR)
-    print("\n✅ Evaluation complete.")
     print("Next steps:")
     print("  1. Review eval/results/summary.csv for headline metrics")
     print("  2. Run eval/calibration.py to generate calibration curve PNG")

             if retry_after > _MAX_WAIT_SECONDS:
                 # Daily token limit — no point waiting
                 print(
+                    f"\n[STOP] Groq daily token limit reached "
                     f"(retry-after={retry_after:.0f}s > {_MAX_WAIT_SECONDS}s cap).\n"
                     f"   Results saved so far are intact.\n"
+                    f"   Re-run tomorrow -- the harness will resume from where it stopped.\n"
                     f"   Exiting cleanly now."
                 )
                 raise SystemExit(0)
             # TPM limit — wait and retry
             actual_wait = min(retry_after + 2, _MAX_WAIT_SECONDS)
             print(
+                f"\n[WAIT] Groq rate limit (attempt {attempt+1}/{_MAX_RETRIES}). "
                 f"Waiting {actual_wait:.0f}s before retry..."
             )
             time.sleep(actual_wait)
     print(f"{'='*60}")
     if not remaining:
+        print("  [done] Already complete, skipping.")
         return
     f, writer = get_csv_writer(output_path)
                     raise
                 row["position_accuracy"] = score
                 row["judge_reason"]      = reason[:200]
+                print(f"    >> verdict={row['critic_verdict']}  judge={score}  {reason[:60]}")
             else:
+                print(f"    >> verdict={row['critic_verdict']}  judge=SKIPPED (no position or GT)")
             # ── Staleness catch rate — Category B ────────────────────────
             if category == "B":
     finally:
         f.close()
+    print(f"\n  [done] {arch_name} complete -> {output_path}")
 # ── Summary aggregation ───────────────────────────────────────────────────────
             writer = csv.DictWriter(f, fieldnames=list(summary_rows[0].keys()))
             writer.writeheader()
             writer.writerows(summary_rows)
+        print(f"\n[OK] Summary written -> {summary_path}")
         print_summary_table(summary_rows)
     else:
         print("\n⚠ No completed result files found to summarise.")
             f"  {r['retry_rate']*100:>6.1f}%"
         )
     print("="*90)
+    print(">> staleness_catch_rate and contradiction_catch_rate are your headline resume metrics.")
+    print(">> Copy these numbers into resume bullets after verifying they make sense.")
 # ── Entry point ───────────────────────────────────────────────────────────────
     # ── Compute and print summary ─────────────────────────────────────────────
     compute_summary(RESULTS_DIR)
+    print("\n[OK] Evaluation complete.")
     print("Next steps:")
     print("  1. Review eval/results/summary.csv for headline metrics")
     print("  2. Run eval/calibration.py to generate calibration curve PNG")

eval/run_recon_linear.py ADDED Viewed

	@@ -0,0 +1,62 @@

"""Run the ``recon_linear_v2`` evaluation and print a summary of its results.

Feeds every question in ``eval/questions.json`` through ``run_architecture``
with the ``linear`` decay config, then re-reads the results CSV and prints the
verdict distribution, the Category B / Category C catch rates, and position
accuracy next to the hard-coded v1 baseline figures.
"""

import csv
import json
import os
import sys
from collections import Counter

QUESTIONS_PATH = 'eval/questions.json'
GROUND_TRUTH_PATH = 'eval/ground_truth.json'
OUTPUT = 'eval/results/recon_linear_v2.csv'
# position_accuracy values that count as "judged"; anything else is left out
# of the accuracy denominator.
JUDGED_SCORES = ('MATCH', 'PARTIAL', 'MISMATCH')


def _cell(row: dict, key: str) -> str:
    """Return ``row[key]`` as a string, mapping an absent/``None`` cell to ''."""
    value = row.get(key)
    return '' if value is None else value


def _pct(part: int, whole: int) -> str:
    """Format ``part / whole`` as a one-decimal percentage; 0/0 gives '0.0'."""
    return f"{part / max(whole, 1) * 100:.1f}"


def _load_json(path: str):
    """Parse the JSON file at *path*, closing the handle afterwards."""
    # Explicit UTF-8: the platform default encoding is locale-dependent.
    with open(path, encoding='utf-8') as f:
        return json.load(f)


def build_summary_lines(rows: list[dict]) -> list[str]:
    """Return the summary lines for *rows*, in the order they are printed.

    Args:
        rows: Result rows as produced by ``csv.DictReader``; may be empty.

    Returns:
        One string per ``print`` call; a leading ``"\\n"`` inside a line
        produces the blank separator lines of the original output.
    """
    verdicts = [_cell(r, 'critic_verdict') for r in rows]
    verdict_counts = dict(sorted(Counter(verdicts).items()))

    cat_b = [v for r, v in zip(rows, verdicts) if _cell(r, 'category') == 'B']
    cat_c = [v for r, v in zip(rows, verdicts) if _cell(r, 'category') == 'C']
    # For Category B, a CONTRADICTED verdict also counts as a staleness catch.
    staleness_caught = sum(1 for v in cat_b if v in ('STALE', 'CONTRADICTED'))
    contradiction_caught = sum(1 for v in cat_c if v == 'CONTRADICTED')

    # Normalised the same way eval/check_progress.py matches MATCH, so both
    # scripts agree on which rows count.
    scores = [_cell(r, 'position_accuracy').strip().upper() for r in rows]
    judged = [s for s in scores if s in JUDGED_SCORES]
    matches = judged.count('MATCH')

    contradiction_pct = _pct(contradiction_caught, len(cat_c))
    accuracy_pct = _pct(matches, len(judged))
    return [
        f"\nTotal rows: {len(rows)}",
        f"Verdict distribution: {verdict_counts}",
        f"\nSTALE: {verdicts.count('STALE')} | CONTRADICTED: {verdicts.count('CONTRADICTED')}",
        f"\nCat B (staleness, n={len(cat_b)}): staleness_caught={staleness_caught} ({_pct(staleness_caught, len(cat_b))}%)",
        f"Cat C (contradiction, n={len(cat_c)}): contradiction_caught={contradiction_caught} ({contradiction_pct}%)",
        f"\nPosition accuracy (MATCH): {matches}/{len(judged)} = {accuracy_pct}%",
        "\nv1 baseline comparison:",
        f"  Contradiction catch rate: v1=0.0%  v2={contradiction_pct}%",
        f"  Position accuracy:        v1=43.9% v2={accuracy_pct}%",
    ]


def main() -> None:
    """Run the evaluation, then print the summary of the results CSV."""
    # Same order as the original script: make the repo root importable, load
    # .env, and only then import the harness.  Importing here (not at module
    # top) also keeps the summary helpers importable without the harness.
    sys.path.insert(0, '.')
    from dotenv import load_dotenv
    load_dotenv()
    from eval.run_eval import run_architecture, run_recon_full

    questions = _load_json(QUESTIONS_PATH)
    gt = {entry['id']: entry for entry in _load_json(GROUND_TRUTH_PATH)}

    print(f"Running recon_linear v2 -- {len(questions)} questions")
    print(f"Output: {OUTPUT}")
    print("Crash-resume: enabled (will skip already-done question IDs)")
    print("=" * 60)
    run_architecture(
        arch_name='recon_linear_v2',
        decay_config='linear',
        runner_fn=run_recon_full,
        questions=questions,
        gt_map=gt,
        output_path=OUTPUT,
    )

    print("\nRun complete. Computing summary...")
    if not os.path.exists(OUTPUT):
        # Nothing was written (e.g. an empty question list), so there is
        # nothing to read back.
        print(f"No results file at {OUTPUT} -- nothing to summarise.")
        return
    # newline='' / utf-8 mirror how eval/check_progress.py reads this file.
    with open(OUTPUT, newline='', encoding='utf-8') as f:
        rows = list(csv.DictReader(f))
    for line in build_summary_lines(rows):
        print(line)


if __name__ == '__main__':
    main()

eval/smoke_test.py ADDED Viewed

	@@ -0,0 +1,37 @@

"""Smoke-test the recon_linear architecture on the first five questions.

Runs ``run_architecture`` over ``questions[:5]`` with the ``linear`` decay
config, then prints a small table of the resulting verdicts and position
accuracies together with the verdict counts.
"""

import csv
import json
import os
import sys
from collections import Counter

QUESTIONS_PATH = 'eval/questions.json'
GROUND_TRUTH_PATH = 'eval/ground_truth.json'
SMOKE_OUTPUT = 'eval/results/recon_linear_smoke.csv'
SMOKE_QUESTION_COUNT = 5
QUESTION_PREVIEW_CHARS = 60


def _cell(row: dict, key: str, default: str = '') -> str:
    """Return ``row[key]``, or *default* when the cell is absent.

    ``csv.DictReader`` fills the missing cells of a short row with ``None``,
    and ``None`` rejects format specs such as ``:<8``, so it is replaced here.
    """
    value = row.get(key)
    return default if value is None else value


def _load_json(path: str):
    """Parse the JSON file at *path*, closing the handle afterwards."""
    # Explicit UTF-8: the platform default encoding is locale-dependent.
    with open(path, encoding='utf-8') as f:
        return json.load(f)


def format_results(rows: list[dict]) -> list[str]:
    """Return the results table and verdict counts as printable lines.

    Args:
        rows: Result rows as produced by ``csv.DictReader``; may be empty.

    Returns:
        Header, an 80-character rule, one line per row, then the verdict
        counts.  A leading ``"\\n"`` inside a line produces a blank separator.
    """
    lines = [
        f"\n{'ID':<8} {'VERDICT':<15} {'ACCURACY':<10} QUESTION[:60]",
        "-" * 80,
    ]
    for r in rows:
        lines.append(
            f"{_cell(r, 'question_id', '?'):<8} "
            f"{_cell(r, 'critic_verdict', '?'):<15} "
            f"{_cell(r, 'position_accuracy', '?'):<10} "
            f"{_cell(r, 'question')[:QUESTION_PREVIEW_CHARS]}"
        )
    verdict_counts = Counter(_cell(r, 'critic_verdict') for r in rows)
    lines.append(f"\nVerdict counts: {dict(sorted(verdict_counts.items()))}")
    return lines


def main() -> None:
    """Run the five-question smoke test and print its results table."""
    # Same order as the original script: make the repo root importable, load
    # .env, and only then import the harness.  Importing here (not at module
    # top) also keeps format_results importable without the harness.
    sys.path.insert(0, '.')
    from dotenv import load_dotenv
    load_dotenv()
    from eval.run_eval import run_architecture, run_recon_full

    questions = _load_json(QUESTIONS_PATH)
    gt = {entry['id']: entry for entry in _load_json(GROUND_TRUTH_PATH)}
    smoke_qs = questions[:SMOKE_QUESTION_COUNT]

    print(f"Running smoke test: {len(smoke_qs)} questions, recon_linear architecture")
    print("=" * 60)
    # NOTE(review): run_architecture appears to skip question IDs already in
    # the output file, so re-running with an existing smoke CSV may just
    # re-print old rows -- confirm, and delete the file first if a fresh run
    # is wanted.
    run_architecture(
        arch_name='recon_linear_smoke',
        decay_config='linear',
        runner_fn=run_recon_full,
        questions=smoke_qs,
        gt_map=gt,
        output_path=SMOKE_OUTPUT,
    )

    print("Smoke test complete. Reading results...")
    if not os.path.exists(SMOKE_OUTPUT):
        print(f"No results file at {SMOKE_OUTPUT} -- nothing to show.")
        return
    # Assumes the harness writes UTF-8 CSV (eval/check_progress.py reads its
    # results that way) -- TODO confirm against get_csv_writer.
    with open(SMOKE_OUTPUT, newline='', encoding='utf-8') as f:
        rows = list(csv.DictReader(f))
    for line in format_results(rows):
        print(line)


if __name__ == '__main__':
    main()