""" Run the distractor-trap eval against a real frontier model via OpenRouter. This tests the slug's REAL value proposition: does the skill's negative knowledge (the documented trap) steer a frontier model away from the tempting wrong approach it would otherwise take? This is the gap a frontier model cannot fill from weights. PREREQS (never hardcode the key): export OPENROUTER_API_KEY=sk-or-... export EVAL_ANSWERER_MODEL=anthropic/claude-opus-4.6 # optional override Run from the repo root (imports the real skill_builder): python run_distractor_eval.py """ from __future__ import annotations import sys from openrouter_client import make_completer, verify_models, DEFAULT_ANSWERER_MODEL, DEFAULT_GRADER_MODEL from distractor_eval import DISTRACTOR_JUDGE_SYSTEM from distractor_eval import run_distractor_eval from distractor_cases import CASES def main() -> int: model = DEFAULT_ANSWERER_MODEL grader_model = DEFAULT_GRADER_MODEL if model == grader_model: print("WARNING: answerer and grader are the same model; set EVAL_GRADER_MODEL " "to a different model to keep the judge independent.") print(f"Verifying models live on OpenRouter...") status = verify_models(model, grader_model) for mid, ok in status.items(): print(f" {mid}: {'OK' if ok is True else ok}") if not all(v is True for v in status.values()): print("Set EVAL_ANSWERER_MODEL / EVAL_GRADER_MODEL to current ids (https://openrouter.ai/models).") return 1 # Low temperature: we want the model's DEFAULT instinct, not creative variance. answerer = make_completer(model, temperature=0.1, max_tokens=700) grader = make_completer(grader_model, system=DISTRACTOR_JUDGE_SYSTEM, temperature=0.0, max_tokens=200) print("\n--- Running distractor-trap eval ---") print("(does the skill steer the model away from the tempting wrong approach?)") print(f"answerer: {model} judge: {grader_model}") rep = run_distractor_eval(CASES, answerer, grader, out_dir="./distractor_runs") print("\n========== RESULT ==========") print(f"answerer: {model}") print(f"cases scored: {rep['n_scored']} (leaked excluded: {rep['n_leaked_excluded']})") print(f"trap-avoidance WITHOUT skill: {rep['no_skill_avoidance_rate']}") print(f"trap-avoidance WITH skill: {rep['with_skill_avoidance_rate']}") print(f"AVOIDANCE UPLIFT: {rep['avoidance_uplift']}") print(f"rescues (trapped->avoided): {rep['rescues']}") print(f"regressions (avoided->trapped): {rep['regressions']}") print("\nper-case:") for r in rep["per_case"]: flag = " [LEAKED]" if r["leaked"] else "" print(f" {r['name']}: without={r['no_skill_avoided']} with={r['with_skill_avoided']}{flag}") print(f" without: {r['no_skill_detail']}") print(f" with: {r['with_skill_detail']}") print("\nraw answers saved to:", rep["raw_saved_to"]) print("\nInterpretation:") print(" - rescues > 0 : the skill genuinely helped (model fell for the trap") print(" unaided, avoided it with the skill). This is the slug's value.") print(" - uplift ~0, both already high : the frontier model didn't need the skill") print(" for these traps (they were not hard enough). Report honestly.") print(" - regressions > 0 : the skill MISLED the model. Report this too.") return 0 if __name__ == "__main__": sys.exit(main())