| """ |
| Run the distractor-trap eval against a real frontier model via OpenRouter. |
| |
| This tests the slug's REAL value proposition: does the skill's negative knowledge |
| (the documented trap) steer a frontier model away from the tempting wrong approach |
| it would otherwise take? This is the gap a frontier model cannot fill from weights. |
| |
| PREREQS (never hardcode the key): |
| export OPENROUTER_API_KEY=sk-or-... |
    export EVAL_ANSWERER_MODEL=anthropic/claude-opus-4.6 # optional override
    export EVAL_GRADER_MODEL=<model-id> # optional override; use a different model than the answerer
| Run from the repo root (imports the real skill_builder): |
| python run_distractor_eval.py |
| """ |
|
|
| from __future__ import annotations |
|
|
| import sys |
|
|
| from openrouter_client import make_completer, verify_models, DEFAULT_ANSWERER_MODEL, DEFAULT_GRADER_MODEL |
| from distractor_eval import DISTRACTOR_JUDGE_SYSTEM |
| from distractor_eval import run_distractor_eval |
| from distractor_cases import CASES |
|
|
|
|
def _verify_models_live(model, grader_model) -> bool:
    """Verify both model ids and print one status line per id.

    Calls ``verify_models`` and prints ``OK`` for every id whose status is
    exactly ``True``; any other status value is printed verbatim.

    Returns:
        True only when every reported status is exactly ``True``.
    """
    print("Verifying models live on OpenRouter...")
    status = verify_models(model, grader_model)
    for mid, ok in status.items():
        print(f" {mid}: {'OK' if ok is True else ok}")
    return all(v is True for v in status.values())


def _print_report(rep, model) -> None:
    """Print the aggregate numbers, per-case rows and reading guide for *rep*.

    *rep* is the report returned by ``run_distractor_eval``; only the keys
    read below are required.
    """
    print("\n========== RESULT ==========")
    print(f"answerer: {model}")
    print(f"cases scored: {rep['n_scored']} (leaked excluded: {rep['n_leaked_excluded']})")
    print(f"trap-avoidance WITHOUT skill: {rep['no_skill_avoidance_rate']}")
    print(f"trap-avoidance WITH skill: {rep['with_skill_avoidance_rate']}")
    print(f"AVOIDANCE UPLIFT: {rep['avoidance_uplift']}")
    print(f"rescues (trapped->avoided): {rep['rescues']}")
    print(f"regressions (avoided->trapped): {rep['regressions']}")

    print("\nper-case:")
    for r in rep["per_case"]:
        flag = " [LEAKED]" if r["leaked"] else ""
        print(f" {r['name']}: without={r['no_skill_avoided']} with={r['with_skill_avoided']}{flag}")
        print(f" without: {r['no_skill_detail']}")
        print(f" with: {r['with_skill_detail']}")

    print("\nraw answers saved to:", rep["raw_saved_to"])

    print("\nInterpretation:")
    print(" - rescues > 0 : the skill genuinely helped (model fell for the trap")
    print(" unaided, avoided it with the skill). This is the slug's value.")
    print(" - uplift ~0, both already high : the frontier model didn't need the skill")
    print(" for these traps (they were not hard enough). Report honestly.")
    print(" - regressions > 0 : the skill MISLED the model. Report this too.")


def main() -> int:
    """Run the distractor-trap eval end to end and print the report.

    Steps: warn if answerer and grader are the same model, verify both
    model ids, build the answerer and grader completers, run the eval over
    ``CASES`` (raw output directory: ``./distractor_runs``), then print the
    results.

    Returns:
        0 after the report has been printed; 1 if any model id fails
        verification (the eval is not run in that case).
    """
    model = DEFAULT_ANSWERER_MODEL
    grader_model = DEFAULT_GRADER_MODEL

    # Only a warning: the run still proceeds with a shared model.
    if model == grader_model:
        print("WARNING: answerer and grader are the same model; set EVAL_GRADER_MODEL "
              "to a different model to keep the judge independent.")

    if not _verify_models_live(model, grader_model):
        print("Set EVAL_ANSWERER_MODEL / EVAL_GRADER_MODEL to current ids (https://openrouter.ai/models).")
        return 1

    answerer = make_completer(model, temperature=0.1, max_tokens=700)
    # The grader gets the judge system prompt and temperature 0.0.
    grader = make_completer(grader_model, system=DISTRACTOR_JUDGE_SYSTEM,
                            temperature=0.0, max_tokens=200)

    print("\n--- Running distractor-trap eval ---")
    print("(does the skill steer the model away from the tempting wrong approach?)")
    print(f"answerer: {model} judge: {grader_model}")
    rep = run_distractor_eval(CASES, answerer, grader, out_dir="./distractor_runs")

    _print_report(rep, model)
    return 0
|
|
|
|
if __name__ == "__main__":
    # Script entry point: main()'s return value becomes the process exit status.
    exit_status = main()
    sys.exit(exit_status)
|
|