#!/usr/bin/env python3
"""
auto_finalize_qwen.py — Monitors or_qwen.json and auto-finalizes when n>=60

1. Polls or_qwen.json every 60s
2. When valid >= 60, runs build_final_analysis.py
3. Updates hallumaze_final.html with final Qwen data
"""
from __future__ import annotations

import json
import math
import random
import subprocess
import sys
import time
from pathlib import Path

BASE = Path(__file__).parent.parent
QWEN_FILE = BASE / "experiment_results" / "or_qwen.json"
ANALYSIS_SCRIPT = BASE / "scripts" / "build_final_analysis.py"
FINAL_HTML = BASE / "hallumaze_final.html"
ANALYSIS_OUT = BASE / "experiment_results" / "analysis_final2.json"
TARGET_N = 60        # number of valid trials that triggers finalization
POLL_INTERVAL = 60   # seconds between polls of QWEN_FILE


def load_valid(path: Path) -> list[dict]:
    """Return the valid trial records stored in *path*.

    The file may hold either a JSON list of records or a JSON object with a
    ``"results"`` list.  A record is valid when it is a dict with a falsy (or
    missing) ``"error"`` field and a non-None ``"sr"`` field.

    Returns an empty list when the file is missing, unreadable, not valid
    JSON, or not shaped as described, so the polling loop in ``main`` keeps
    running instead of crashing.
    """
    if not path.exists():
        return []
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError; either
        # can happen if the file is read while another process is writing it.
        print(f"[WARN] could not read {path.name}: {exc}", file=sys.stderr)
        return []
    if isinstance(data, dict):
        data = data.get("results", [])
    if not isinstance(data, list):
        return []
    return [
        r for r in data
        if isinstance(r, dict) and not r.get("error") and r.get("sr") is not None
    ]


def bootstrap_ci(values, n_boot=2000, ci=0.95):
    """Return ``(mean, lo, hi)``: the sample mean and a percentile bootstrap CI.

    Resampling uses a fixed seed (42), so results are reproducible for a given
    input.  Returns ``(0.0, 0.0, 0.0)`` for empty input.
    """
    if not values:
        return 0.0, 0.0, 0.0
    rng = random.Random(42)
    n = len(values)
    # Each bootstrap replicate is the mean of n draws with replacement.
    means = [
        sum(values[rng.randint(0, n - 1)] for _ in range(n)) / n
        for _ in range(n_boot)
    ]
    means.sort()
    lo = means[int(n_boot * (1 - ci) / 2)]
    hi = means[int(n_boot * (1 - (1 - ci) / 2)) - 1]
    return sum(values) / n, lo, hi


def wilcoxon_rank_sum(x, y):
    """Return the two-sided p-value of a rank-sum test between *x* and *y*.

    Uses mid-ranks for ties and the normal approximation (no continuity
    correction).  The variance includes the standard tie correction, which
    matters when many observations share a value.  Returns 1.0 when either
    sample is empty or when all observations are identical.
    """
    n1, n2 = len(x), len(y)
    if not n1 or not n2:
        return 1.0
    combined = sorted([(v, 1) for v in x] + [(v, 2) for v in y])
    n_total = n1 + n2

    rank_sum_x = 0.0   # sum of the ranks belonging to sample x
    tie_term = 0       # sum of t**3 - t over groups of t tied values
    i = 0
    while i < n_total:
        j = i
        while j < n_total and combined[j][0] == combined[i][0]:
            j += 1
        # Positions i..j-1 (0-based) hold equal values; their shared rank is
        # the average of the 1-based ranks i+1..j.
        avg_rank = (i + j + 1) / 2
        group_size = j - i
        tie_term += group_size ** 3 - group_size
        rank_sum_x += avg_rank * sum(1 for k in range(i, j) if combined[k][1] == 1)
        i = j

    mu_w = n1 * (n_total + 1) / 2
    var_w = n1 * n2 / 12 * ((n_total + 1) - tie_term / (n_total * (n_total - 1)))
    if var_w <= 0:
        return 1.0
    z = (rank_sum_x - mu_w) / math.sqrt(var_w)
    # erfc(|z|/sqrt(2)) equals 2 * (1 - Phi(|z|)) but does not cancel to 0.0
    # for large |z| the way the 1 - erf form does.
    return math.erfc(abs(z) / math.sqrt(2))


def cohens_d(x, y):
    """Return the absolute Cohen's d between samples *x* and *y*.

    The pooled standard deviation is the root of the unweighted mean of the
    two sample variances.  Returns 0.0 if either sample has fewer than two
    values.  When both samples have zero variance, 1e-9 is used as the
    denominator to avoid dividing by zero.
    """
    if len(x) < 2 or len(y) < 2:
        return 0.0
    mx, my = sum(x) / len(x), sum(y) / len(y)
    sx = math.sqrt(sum((v - mx) ** 2 for v in x) / (len(x) - 1))
    sy = math.sqrt(sum((v - my) ** 2 for v in y) / (len(y) - 1))
    pooled = math.sqrt((sx ** 2 + sy ** 2) / 2) if (sx or sy) else 1e-9
    return abs(mx - my) / pooled


def compute_qwen_stats(recs: list[dict]) -> dict:
    """Summarize trial records into the statistics consumed by ``update_html``.

    Returns a dict with keys ``n``, ``mei_mean``, ``mei_lo``, ``mei_hi``,
    ``sr_mean``, ``hrr_mean``, ``brs_mean``, ``cohens_d`` and ``p_bonf``.
    """
    # "mei" falls back to "hallumaze_score" and then to 0 when absent.
    mei_vals = [r.get("mei", r.get("hallumaze_score", 0)) for r in recs]
    sr_vals = [r.get("sr", 0) for r in recs]
    hrr_vals = [r.get("hrr", 0) for r in recs]
    brs_vals = [r.get("brs", 0) for r in recs]

    mei_m, mei_lo, mei_hi = bootstrap_ci(mei_vals)
    sr_m, _, _ = bootstrap_ci(sr_vals)
    hrr_m, _, _ = bootstrap_ci(hrr_vals)
    brs_m, _, _ = bootstrap_ci(brs_vals)

    # Reference sample: 60 identical values of 0.9.
    # NOTE(review): presumably a fixed baseline condition — confirm its source.
    rw_mei = [0.9] * 60
    p_raw = wilcoxon_rank_sum(mei_vals, rw_mei)
    # Multiply by 8 and cap at 1.0.
    # NOTE(review): presumably a Bonferroni correction over 8 comparisons.
    p_bonf = min(p_raw * 8, 1.0)
    d = cohens_d(rw_mei, mei_vals)

    return {
        "n": len(recs),
        "mei_mean": mei_m,
        "mei_lo": mei_lo,
        "mei_hi": mei_hi,
        "sr_mean": sr_m,
        "hrr_mean": hrr_m,
        "brs_mean": brs_m,
        "cohens_d": d,
        "p_bonf": p_bonf,
    }


def _replace_or_warn(html: str, old: str, new: str, label: str) -> str:
    """Replace *old* with *new* in *html*, warning on stderr if nothing matches."""
    if not old:
        # str.replace with an empty pattern would insert `new` between every
        # character of the document, so an empty pattern is never applied.
        print(f"[WARN] {label}: empty search pattern, skipped", file=sys.stderr)
        return html
    if old not in html:
        print(f"[WARN] {label}: pattern not found, nothing replaced", file=sys.stderr)
        return html
    return html.replace(old, new)


def update_html(stats: dict):
    """Rewrite the Qwen entries of FINAL_HTML using *stats*.

    *stats* is the dict produced by ``compute_qwen_stats``.  Each replacement
    that finds nothing to replace is reported on stderr rather than silently
    ignored.  The file is read and written as UTF-8.

    NOTE(review): the search/replacement literals below contain no markup and
    two of them are empty; they look as if tags were lost from this file.
    Verify them against hallumaze_final.html before relying on this function.
    """
    html = FINAL_HTML.read_text(encoding="utf-8")
    n = stats["n"]
    mei_m = stats["mei_mean"]
    mei_lo = stats["mei_lo"]
    mei_hi = stats["mei_hi"]
    sr_pct = stats["sr_mean"] * 100
    hrr_pct = stats["hrr_mean"] * 100
    d = stats["cohens_d"]

    # The replacement text below states p<0.001 unconditionally; flag the case
    # where the computed value disagrees so the page is not silently wrong.
    p_bonf = stats.get("p_bonf")
    if p_bonf is not None and p_bonf >= 0.001:
        print(
            f"[WARN] computed p_bonf={p_bonf:.4g} but the HTML text says p<0.001",
            file=sys.stderr,
        )

    # SVG scatter: SR=x, HRR=y
    # x axis: 0% SR=60, 60% SR=580 (range 520px, x-axis goes to 60% SR only)
    # y axis: 0% HRR=360, 100% HRR=20 (range 340px, inverted)
    # NOTE(review): svg_x, svg_y and grade_cls are not referenced by any of the
    # literals visible below — presumably they belong in the lost markup.
    svg_x = int(60 + (sr_pct / 60) * 520)
    svg_y = int(360 - (hrr_pct / 100) * 340)

    # Grade: A>=0.8, B>=0.55, C>=0.45, D>=0.35, F<0.35
    if mei_m >= 0.80:
        grade, grade_cls = "A", "grade-a"
    elif mei_m >= 0.55:
        grade, grade_cls = "B", "grade-b"
    elif mei_m >= 0.45:
        grade, grade_cls = "C", "grade-c"
    elif mei_m >= 0.35:
        grade, grade_cls = "D", "grade-d"
    else:
        grade, grade_cls = "F", "grade-f"

    # 1. Update leaderboard row
    old_row = (
        ' Qwen-2.5-72B'
        'Alibaba (OpenRouter)'
        'preliminary\n'
        ' 0.576'
        '[0.448, 0.699]\n'
        ' C\n'
        ' 18.2%'
        '\n'
        ' 65.2%'
        '\n'
        ' 22 +'
    )
    new_row = (
        ' Qwen-2.5-72B'
        'Alibaba (OpenRouter)\n'
        f' {mei_m:.3f}'
        f'[{mei_lo:.3f}, {mei_hi:.3f}]\n'
        f' {grade}\n'
        f' {sr_pct:.1f}%'
        '\n'
        f' {hrr_pct:.1f}%'
        '\n'
        f' {n}'
    )
    html = _replace_or_warn(html, old_row, new_row, "leaderboard row")

    # 2. Update table caption note
    html = _replace_or_warn(
        html,
        'Qwen-2.5-72B: n=22 (still running; preliminary). Claude-3-Haiku: first complete run (NEW).',
        'Claude-3-Haiku: first complete run (NEW).',
        "table caption",
    )

    # 3. Update SVG scatter point + label
    # NOTE(review): both patterns are empty as they appear in this file.
    html = _replace_or_warn(html, '', '', "SVG scatter point")
    # Upgrade from dashed-preliminary to solid circle
    html = _replace_or_warn(html, '', '', "SVG circle style")
    html = _replace_or_warn(html, 'Qwen (n=22)', 'Qwen', "SVG label")

    # 4. Update stats table Cohen's d
    html = _replace_or_warn(
        html,
        'Qwen-2.5-72B22'
        '1.515'
        '<0.001Yes',
        f'Qwen-2.5-72B{n}'
        f'{d:.3f}'
        '<0.001Yes',
        "stats table row",
    )

    # 5. Remove or update the preliminary accordion
    html = _replace_or_warn(
        html,
        'Preliminary Qwen-2.5-72B results\n'
        'Qwen-2.5-72B has n=22 (of 60 planned). Final ranking position may shift when all trials complete. Reported with caution.',
        f'Qwen-2.5-72B complete (n={n})\n'
        f'Qwen-2.5-72B completed all {n} trials. MEI={mei_m:.3f} [{mei_lo:.3f}, {mei_hi:.3f}], SR={sr_pct:.1f}%, HRR={hrr_pct:.1f}%, Cohen\'s d={d:.3f} (p<0.001).',
        "preliminary accordion",
    )

    FINAL_HTML.write_text(html, encoding="utf-8")
    print(f"[OK] hallumaze_final.html updated: Qwen n={n}, MEI={mei_m:.3f}, SR={sr_pct:.1f}%, HRR={hrr_pct:.1f}%, d={d:.3f}")


def main():
    """Poll QWEN_FILE until TARGET_N valid trials exist, then finalize.

    Finalizing means: compute the summary statistics, run ANALYSIS_SCRIPT in a
    child process (a failure there is reported but does not stop the run), and
    update FINAL_HTML.
    """
    print(f"[auto_finalize_qwen] Monitoring {QWEN_FILE.name} for n>={TARGET_N}...")
    last_n = 0
    while True:
        recs = load_valid(QWEN_FILE)
        n = len(recs)
        if n != last_n:
            print(f" Progress: {n}/{TARGET_N} valid trials")
            last_n = n
        if n >= TARGET_N:
            print(f"\n[OK] Qwen reached n={n}! Computing final stats...")
            stats = compute_qwen_stats(recs)
            print(f" MEI={stats['mei_mean']:.3f} [{stats['mei_lo']:.3f},{stats['mei_hi']:.3f}]")
            print(f" SR={stats['sr_mean']*100:.1f}% HRR={stats['hrr_mean']*100:.1f}% d={stats['cohens_d']:.3f}")

            # Run full analysis rebuild with the interpreter running this
            # script, so the child sees the same environment and packages.
            result = subprocess.run(
                [sys.executable, str(ANALYSIS_SCRIPT)],
                capture_output=True,
                text=True,
                cwd=str(BASE),
            )
            if result.returncode == 0:
                print("[OK] build_final_analysis.py completed")
                print(result.stdout[-500:])
            else:
                print(f"[WARN] build_final_analysis.py failed: {result.stderr[-200:]}")

            # Update HTML
            update_html(stats)
            print("[DONE] All updates complete.")
            break
        time.sleep(POLL_INTERVAL)


if __name__ == "__main__":
    main()