MukulRay committed on
Commit
f95672f
·
1 Parent(s): b2fc5a3

Phase 1.3: smoke test scripts, eval runner, check_progress, fix unicode in run_eval.py

Browse files
eval/check_progress.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Print a quick progress report for the in-flight recon_linear_v2 eval run.

The results CSV is read while the eval harness may still be appending to it,
so the last row can be incomplete; ``csv.DictReader`` then yields ``None`` for
the missing columns. Every field access therefore goes through :func:`text`.
"""
import csv
import os
import sys
from collections import Counter

RESULTS_PATH = 'eval/results/recon_linear_v2.csv'
TOTAL_QUESTIONS = 130  # denominator shown in the "Progress" line


def text(row, key, default=''):
    """Return ``row[key]``, or *default* when the key is missing or ``None``."""
    value = row.get(key)
    return default if value is None else value


def build_report(rows, total=TOTAL_QUESTIONS):
    """Build the report lines for the parsed result rows.

    Args:
        rows: non-empty list of dicts as produced by ``csv.DictReader``.
        total: number of questions printed as the progress denominator.

    Returns:
        A list of strings, one per ``print`` call of the report.

    Raises:
        ValueError: if *rows* is empty (the percentages would be undefined).
    """
    if not rows:
        raise ValueError('build_report() needs at least one row')

    done = len(rows)
    verdicts = [text(r, 'critic_verdict') for r in rows]
    tally = Counter(verdicts)
    # Sorted keys keep the printed distribution stable between invocations.
    distribution = {verdict: tally[verdict] for verdict in sorted(tally)}

    contra = [r for r, v in zip(rows, verdicts) if v == 'CONTRADICTED']
    stale_count = tally['STALE']
    correct = sum(
        1 for r in rows
        if text(r, 'position_accuracy').strip().upper() == 'MATCH'
    )

    lines = [
        f"Progress: {done} / {total} questions done",
        f"Verdict distribution: {distribution}",
        f"CONTRADICTED: {len(contra)} ({len(contra)/done*100:.1f}%)",
        f"STALE: {stale_count} ({stale_count/done*100:.1f}%)",
        f"Position accuracy: {correct}/{done} = {correct/done*100:.1f}%",
    ]
    if contra:
        lines.append("\nFirst CONTRADICTED hits:")
        lines.extend(
            f" [{text(r, 'question_id', '?')}] {text(r, 'question')[:70]}"
            for r in contra[:5]
        )
    return lines


def main(path=RESULTS_PATH, total=TOTAL_QUESTIONS):
    """Read the results CSV at *path* and print the progress report.

    Exits with status 0 (not an error) when the file is missing or has no
    data rows yet, since both simply mean the eval has not produced output.
    """
    if not os.path.exists(path):
        print('No results file yet — eval may still be starting up')
        sys.exit(0)

    with open(path, newline='', encoding='utf-8') as f:
        rows = list(csv.DictReader(f))

    if not rows:
        print('File exists but is empty — eval just started')
        sys.exit(0)

    for line in build_report(rows, total):
        print(line)


if __name__ == '__main__':
    main()
eval/run_eval.py CHANGED
@@ -130,10 +130,10 @@ def _groq_call_with_backoff(llm: ChatGroq, messages: list) -> str:
130
  if retry_after > _MAX_WAIT_SECONDS:
131
  # Daily token limit — no point waiting
132
  print(
133
- f"\n Groq daily token limit reached "
134
  f"(retry-after={retry_after:.0f}s > {_MAX_WAIT_SECONDS}s cap).\n"
135
  f" Results saved so far are intact.\n"
136
- f" Re-run tomorrow the harness will resume from where it stopped.\n"
137
  f" Exiting cleanly now."
138
  )
139
  raise SystemExit(0)
@@ -141,7 +141,7 @@ def _groq_call_with_backoff(llm: ChatGroq, messages: list) -> str:
141
  # TPM limit — wait and retry
142
  actual_wait = min(retry_after + 2, _MAX_WAIT_SECONDS)
143
  print(
144
- f"\n Groq rate limit (attempt {attempt+1}/{_MAX_RETRIES}). "
145
  f"Waiting {actual_wait:.0f}s before retry..."
146
  )
147
  time.sleep(actual_wait)
@@ -445,7 +445,7 @@ def run_architecture(
445
  print(f"{'='*60}")
446
 
447
  if not remaining:
448
- print(" Already complete, skipping.")
449
  return
450
 
451
  f, writer = get_csv_writer(output_path)
@@ -520,9 +520,9 @@ def run_architecture(
520
  raise
521
  row["position_accuracy"] = score
522
  row["judge_reason"] = reason[:200]
523
- print(f" verdict={row['critic_verdict']} judge={score} {reason[:60]}")
524
  else:
525
- print(f" verdict={row['critic_verdict']} judge=SKIPPED (no position or GT)")
526
 
527
  # ── Staleness catch rate — Category B ────────────────────────
528
  if category == "B":
@@ -538,7 +538,7 @@ def run_architecture(
538
  finally:
539
  f.close()
540
 
541
- print(f"\n {arch_name} complete {output_path}")
542
 
543
 
544
  # ── Summary aggregation ───────────────────────────────────────────────────────
@@ -622,7 +622,7 @@ def compute_summary(results_dir: str) -> None:
622
  writer = csv.DictWriter(f, fieldnames=list(summary_rows[0].keys()))
623
  writer.writeheader()
624
  writer.writerows(summary_rows)
625
- print(f"\n Summary written {summary_path}")
626
  print_summary_table(summary_rows)
627
  else:
628
  print("\n⚠ No completed result files found to summarise.")
@@ -643,8 +643,8 @@ def print_summary_table(rows: list[dict]) -> None:
643
  f" {r['retry_rate']*100:>6.1f}%"
644
  )
645
  print("="*90)
646
- print(" staleness_catch_rate and contradiction_catch_rate are your headline resume metrics.")
647
- print(" Copy these numbers into resume bullets after verifying they make sense.")
648
 
649
 
650
  # ── Entry point ───────────────────────────────────────────────────────────────
@@ -725,7 +725,7 @@ def main():
725
  # ── Compute and print summary ─────────────────────────────────────────────
726
  compute_summary(RESULTS_DIR)
727
 
728
- print("\n Evaluation complete.")
729
  print("Next steps:")
730
  print(" 1. Review eval/results/summary.csv for headline metrics")
731
  print(" 2. Run eval/calibration.py to generate calibration curve PNG")
 
130
  if retry_after > _MAX_WAIT_SECONDS:
131
  # Daily token limit — no point waiting
132
  print(
133
+ f"\n[STOP] Groq daily token limit reached "
134
  f"(retry-after={retry_after:.0f}s > {_MAX_WAIT_SECONDS}s cap).\n"
135
  f" Results saved so far are intact.\n"
136
+ f" Re-run tomorrow -- the harness will resume from where it stopped.\n"
137
  f" Exiting cleanly now."
138
  )
139
  raise SystemExit(0)
 
141
  # TPM limit — wait and retry
142
  actual_wait = min(retry_after + 2, _MAX_WAIT_SECONDS)
143
  print(
144
+ f"\n[WAIT] Groq rate limit (attempt {attempt+1}/{_MAX_RETRIES}). "
145
  f"Waiting {actual_wait:.0f}s before retry..."
146
  )
147
  time.sleep(actual_wait)
 
445
  print(f"{'='*60}")
446
 
447
  if not remaining:
448
+ print(" [done] Already complete, skipping.")
449
  return
450
 
451
  f, writer = get_csv_writer(output_path)
 
520
  raise
521
  row["position_accuracy"] = score
522
  row["judge_reason"] = reason[:200]
523
+ print(f" >> verdict={row['critic_verdict']} judge={score} {reason[:60]}")
524
  else:
525
+ print(f" >> verdict={row['critic_verdict']} judge=SKIPPED (no position or GT)")
526
 
527
  # ── Staleness catch rate — Category B ────────────────────────
528
  if category == "B":
 
538
  finally:
539
  f.close()
540
 
541
+ print(f"\n [done] {arch_name} complete -> {output_path}")
542
 
543
 
544
  # ── Summary aggregation ───────────────────────────────────────────────────────
 
622
  writer = csv.DictWriter(f, fieldnames=list(summary_rows[0].keys()))
623
  writer.writeheader()
624
  writer.writerows(summary_rows)
625
+ print(f"\n[OK] Summary written -> {summary_path}")
626
  print_summary_table(summary_rows)
627
  else:
628
  print("\n⚠ No completed result files found to summarise.")
 
643
  f" {r['retry_rate']*100:>6.1f}%"
644
  )
645
  print("="*90)
646
+ print(">> staleness_catch_rate and contradiction_catch_rate are your headline resume metrics.")
647
+ print(">> Copy these numbers into resume bullets after verifying they make sense.")
648
 
649
 
650
  # ── Entry point ───────────────────────────────────────────────────────────────
 
725
  # ── Compute and print summary ─────────────────────────────────────────────
726
  compute_summary(RESULTS_DIR)
727
 
728
+ print("\n[OK] Evaluation complete.")
729
  print("Next steps:")
730
  print(" 1. Review eval/results/summary.csv for headline metrics")
731
  print(" 2. Run eval/calibration.py to generate calibration curve PNG")
eval/run_recon_linear.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Run the recon_linear v2 evaluation and print a summary against the v1 baseline.

Paths are relative, so run this from the repository root. The harness call
writes ``OUTPUT``; the summary is then computed by re-reading that CSV.
"""
import csv
import json
import os
import sys
from collections import Counter

OUTPUT = 'eval/results/recon_linear_v2.csv'


def load_json(path):
    """Parse the JSON file at *path*, closing the file handle afterwards."""
    with open(path, encoding='utf-8') as fh:
        return json.load(fh)


def pct(part, whole):
    """Return ``part / whole`` as a percentage; a zero *whole* yields 0.0."""
    return part / max(whole, 1) * 100


def summarise(rows):
    """Aggregate the result rows into the counts shown in the summary.

    Args:
        rows: list of dicts as produced by ``csv.DictReader``.

    Returns:
        Dict with keys ``total``, ``verdict_counts``, ``stale``,
        ``contradicted``, ``cat_b``, ``staleness_caught``, ``cat_c``,
        ``contradiction_caught``, ``judged`` and ``matched``.
    """
    verdict_counts = Counter(r.get('critic_verdict', '') for r in rows)

    cat_b = [r for r in rows if r.get('category') == 'B']
    cat_c = [r for r in rows if r.get('category') == 'C']

    # Only rows the judge actually scored count towards position accuracy.
    judged = [
        r for r in rows
        if r.get('position_accuracy') in ('MATCH', 'PARTIAL', 'MISMATCH')
    ]

    return {
        'total': len(rows),
        'verdict_counts': dict(verdict_counts),
        'stale': verdict_counts.get('STALE', 0),
        'contradicted': verdict_counts.get('CONTRADICTED', 0),
        'cat_b': len(cat_b),
        # For category B, a CONTRADICTED verdict also counts as a catch.
        'staleness_caught': sum(
            1 for r in cat_b
            if r.get('critic_verdict', '') in ('STALE', 'CONTRADICTED')
        ),
        'cat_c': len(cat_c),
        'contradiction_caught': sum(
            1 for r in cat_c
            if r.get('critic_verdict', '') == 'CONTRADICTED'
        ),
        'judged': len(judged),
        'matched': sum(1 for r in judged if r.get('position_accuracy') == 'MATCH'),
    }


def main():
    """Run the recon_linear_v2 architecture over all questions and report."""
    # Project imports are deferred so the environment is loaded first and the
    # repository root is importable, matching the original statement order.
    sys.path.insert(0, '.')
    from dotenv import load_dotenv
    load_dotenv()

    from eval.run_eval import run_architecture, run_recon_full

    questions = load_json('eval/questions.json')
    ground_truth_list = load_json('eval/ground_truth.json')
    gt = {e['id']: e for e in ground_truth_list}

    print(f"Running recon_linear v2 -- {len(questions)} questions")
    print(f"Output: {OUTPUT}")
    print(f"Crash-resume: enabled (will skip already-done question IDs)")
    print("=" * 60)

    run_architecture(
        arch_name='recon_linear_v2',
        decay_config='linear',
        runner_fn=run_recon_full,
        questions=questions,
        gt_map=gt,
        output_path=OUTPUT,
    )

    print("\nRun complete. Computing summary...")
    with open(OUTPUT, newline='', encoding='utf-8') as f:
        rows = list(csv.DictReader(f))

    s = summarise(rows)
    stale_pct = pct(s['staleness_caught'], s['cat_b'])
    contra_pct = pct(s['contradiction_caught'], s['cat_c'])
    accuracy_pct = pct(s['matched'], s['judged'])

    print(f"\nTotal rows: {s['total']}")
    print(f"Verdict distribution: {s['verdict_counts']}")
    print(f"\nSTALE: {s['stale']} | CONTRADICTED: {s['contradicted']}")
    print(f"\nCat B (staleness, n={s['cat_b']}): staleness_caught={s['staleness_caught']} ({stale_pct:.1f}%)")
    print(f"Cat C (contradiction, n={s['cat_c']}): contradiction_caught={s['contradiction_caught']} ({contra_pct:.1f}%)")
    print(f"\nPosition accuracy (MATCH): {s['matched']}/{s['judged']} = {accuracy_pct:.1f}%")
    print(f"\nv1 baseline comparison:")
    print(f" Contradiction catch rate: v1=0.0% v2={contra_pct:.1f}%")
    print(f" Position accuracy: v1=43.9% v2={accuracy_pct:.1f}%")


if __name__ == '__main__':
    main()
eval/smoke_test.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Smoke-test the eval harness on the first few questions with recon_linear.

Paths are relative, so run this from the repository root. After the harness
call, the smoke CSV is read back and printed as a small table.
"""
import csv
import json
import os
import sys
from collections import Counter

SMOKE_OUTPUT = 'eval/results/recon_linear_smoke.csv'
SMOKE_SIZE = 5  # number of leading questions used for the smoke run


def load_json(path):
    """Parse the JSON file at *path*, closing the file handle afterwards."""
    with open(path, encoding='utf-8') as fh:
        return json.load(fh)


def cell(row, key, default='?'):
    """Return ``row[key]``, or *default* when the key is missing or ``None``.

    ``csv.DictReader`` fills short rows with ``None``, which cannot be used
    with a width format spec such as ``:<15``.
    """
    value = row.get(key)
    return default if value is None else value


def format_row(row):
    """Format one result row as a fixed-width table line."""
    return (
        f"{cell(row, 'question_id'):<8} "
        f"{cell(row, 'critic_verdict'):<15} "
        f"{cell(row, 'position_accuracy'):<10} "
        f"{cell(row, 'question', '')[:60]}"
    )


def main():
    """Run the harness on the smoke subset and print the resulting rows."""
    # Project imports are deferred so the environment is loaded first and the
    # repository root is importable, matching the original statement order.
    sys.path.insert(0, '.')
    from dotenv import load_dotenv
    load_dotenv()

    from eval.run_eval import run_architecture, run_recon_full

    questions = load_json('eval/questions.json')
    ground_truth_list = load_json('eval/ground_truth.json')
    gt = {e['id']: e for e in ground_truth_list}

    smoke_qs = questions[:SMOKE_SIZE]

    print(f"Running smoke test: {len(smoke_qs)} questions, recon_linear architecture")
    print("=" * 60)

    run_architecture(
        arch_name='recon_linear_smoke',
        decay_config='linear',
        runner_fn=run_recon_full,
        questions=smoke_qs,
        gt_map=gt,
        output_path=SMOKE_OUTPUT,
    )

    print("Smoke test complete. Reading results...")
    with open(SMOKE_OUTPUT, newline='', encoding='utf-8') as f:
        rows = list(csv.DictReader(f))

    print(f"\n{'ID':<8} {'VERDICT':<15} {'ACCURACY':<10} QUESTION[:60]")
    print("-" * 80)
    for r in rows:
        print(format_row(r))

    verdict_counts = dict(Counter(r.get('critic_verdict', '') for r in rows))
    print(f"\nVerdict counts: {verdict_counts}")


if __name__ == '__main__':
    main()