""" Run the skill-uplift eval against real frontier models via OpenRouter. PREREQUISITES (do these first; never hardcode the key): export OPENROUTER_API_KEY=sk-or-... # optional overrides if the catalog has moved: export EVAL_ANSWERER_MODEL=anthropic/claude-opus-4.6 export EVAL_GRADER_MODEL=openai/gpt-5.1 Run from your repo root so it imports the REAL skill_builder.build_skill_md: python run_skill_eval.py It will: verify the models are live -> run grader calibration -> run the eval -> write skill_eval_runs/_report.json and print a plain-language summary. It reports whatever number comes out. A modest or zero uplift is a valid, honest result. """ from __future__ import annotations import json import sys from openrouter_client import ( make_completer, verify_models, DEFAULT_ANSWERER_MODEL, DEFAULT_GRADER_MODEL, ) from skill_uplift_eval import run_eval, run_calibration, GRADER_SYSTEM from sample_cases import CASES # A tiny human-labeled calibration set: pairs where WE judged which answer is # better. The grader should agree. Swap in your own hand-labeled pairs. 
# Each entry carries exactly three string fields: "task", "better_answer" and
# "worse_answer". The list is handed unchanged to run_calibration() in main().
CALIBRATION = [
    {
        "task": "Efficiently compute the sum of values on every root-to-leaf path's max in a tree.",
        "better_answer": "Do a single post-order (bottom-up) DFS; at each node combine the max of its children's results in O(n) total.",
        "worse_answer": "From each node, separately walk down every path to a leaf and take the max; simple but O(n^2) or worse.",
    },
    {
        "task": "Find the absorption probability in a Markov chain with two absorbing states.",
        "better_answer": "Write first-step equations for the transient states only, excluding absorbing rows, and solve the linear system.",
        "worse_answer": "Run a large Monte Carlo simulation and estimate the fraction of runs that end in the target state.",
    },
    {
        "task": "Avoid a stack overflow when recursing over a very deep tree in Python.",
        "better_answer": "Convert the recursion to an explicit stack/iterative form, or raise the recursion limit knowingly; the depth is the real cause.",
        "worse_answer": "Wrap the recursion in a try/except and hope it does not overflow.",
    },
]


def main() -> int:
    """Verify the models, calibrate the grader, run the eval, print a summary.

    Steps, in order:

    1. Take the answerer and grader model ids from ``DEFAULT_ANSWERER_MODEL``
       and ``DEFAULT_GRADER_MODEL`` and warn if they are the same id.
    2. Call ``verify_models`` and print one status line per model. If any
       status value is not exactly ``True``, print a hint and stop.
    3. Build the two completers with ``make_completer``.
    4. Run ``run_calibration`` over ``CALIBRATION`` and print the agreement
       figure plus one line per calibration row.
    5. Run ``run_eval`` over ``CASES`` and print the resulting report.

    Returns:
        ``1`` if model verification failed (nothing else is run), ``0`` after
        a completed eval. The calibration result is printed but is not used
        to gate the eval.
    """
    answerer_model = DEFAULT_ANSWERER_MODEL
    grader_model = DEFAULT_GRADER_MODEL

    # Only a warning: the run still proceeds when both ids are identical.
    if answerer_model == grader_model:
        print("WARNING: answerer and grader are the same model; pick different "
              "models to reduce self-grading bias (set EVAL_GRADER_MODEL).")

    print("Verifying models are live on OpenRouter...")
    status = verify_models(answerer_model, grader_model)
    for mid, ok in status.items():
        # Any value other than the literal True is echoed as-is
        # (presumably a diagnostic from verify_models -- confirm there).
        print(f" {mid}: {'OK' if ok is True else ok}")
    # Strict identity check: a truthy non-True status still counts as failure.
    if any(ok is not True for ok in status.values()):
        print("\nOne or more model ids are not in the live catalog. Set "
              "EVAL_ANSWERER_MODEL / EVAL_GRADER_MODEL to current ids "
              "(see https://openrouter.ai/models) and re-run.")
        return 1

    answerer = make_completer(answerer_model, temperature=0.2, max_tokens=900)
    # The grader gets its own system prompt and temperature 0.0.
    grader = make_completer(
        grader_model,
        system=GRADER_SYSTEM,
        temperature=0.0,
        max_tokens=300,
    )

    print("\n--- Calibration (does the blind grader agree with human labels?) ---")
    cal = run_calibration(grader, CALIBRATION)
    print("grader agreement:", cal["agreement"])
    for row in cal["rows"]:
        print(f" agree={row['agree']} better={row['better']} worse={row['worse']} {row['task']}")

    print("\n--- Running skill-uplift eval ---")
    report = run_eval(CASES, answerer, grader, out_dir="./skill_eval_runs")

    print("\n========== RESULT ==========")
    print(f"answerer: {answerer_model} grader: {grader_model}")
    print(f"calibration: {cal['agreement']}")
    print(f"cases scored: {report['n_scored']} (excluded as leaked: {report['n_leaked_excluded']})")
    print(f"baseline (no skill): {report['baseline_no_skill_mean']}")
    print(f"with skill: {report['with_skill_mean']}")
    print(f"UPLIFT: {report['uplift']}")
    print(f"win / tie / loss: {report['wins']} / {report['ties']} / {report['losses']}")
    print("per-case:")
    for c in report["per_case"]:
        # Cases flagged as leaked are still listed, with a marker.
        flag = " [LEAKED, excluded]" if c["leaked"] else ""
        print(f" {c['name']}: no={c['no_skill']} with={c['with_skill']} Δ={c['delta']}{flag}")
    print("\nraw generations saved to:", report["raw_saved_to"])
    print("\nReport the number above as-is. Modest or zero uplift is a valid result.")
    return 0


if __name__ == "__main__":
    sys.exit(main())