Spaces:

build-small-hackathon
/

TurboSkillSlug

Running

App Files Files Community

TurboSkillSlug / run_distractor_eval.py

legendarydragontamer

deploy

51a9974 18 days ago

Raw

History Blame Contribute Delete

3.53 kB

	"""
	Run the distractor-trap eval against a real frontier model via OpenRouter.

	This tests the slug's REAL value proposition: does the skill's negative knowledge
	(the documented trap) steer a frontier model away from the tempting wrong approach
	it would otherwise take? This is the gap a frontier model cannot fill from weights.

	PREREQS (never hardcode the key):
	export OPENROUTER_API_KEY=sk-or-...
	export EVAL_ANSWERER_MODEL=anthropic/claude-opus-4.6 # optional override
	export EVAL_GRADER_MODEL=<openrouter-model-id> # optional override; should differ from the answerer
	Run from the repo root (imports the real skill_builder):
	python run_distractor_eval.py
	"""

	from __future__ import annotations

	import sys

	from openrouter_client import make_completer, verify_models, DEFAULT_ANSWERER_MODEL, DEFAULT_GRADER_MODEL
	from distractor_eval import DISTRACTOR_JUDGE_SYSTEM
	from distractor_eval import run_distractor_eval
	from distractor_cases import CASES


	def _models_are_live(model, grader_model) -> bool:
	    """Verify both model ids via ``verify_models`` and print each status.

	    Returns True only when every id maps to ``True`` in the result; any
	    other value is printed as-is next to the id as the failure detail.
	    """
	    print("Verifying models live on OpenRouter...")
	    status = verify_models(model, grader_model)
	    for mid, ok in status.items():
	        print(f" {mid}: {'OK' if ok is True else ok}")
	    return all(v is True for v in status.values())


	def _print_report(model, rep) -> None:
	    """Print the aggregate numbers, per-case outcomes and reading guide.

	    ``rep`` is the value returned by ``run_distractor_eval``; only the keys
	    read below are relied upon.
	    """
	    print("\n========== RESULT ==========")
	    print(f"answerer: {model}")
	    print(f"cases scored: {rep['n_scored']} (leaked excluded: {rep['n_leaked_excluded']})")
	    print(f"trap-avoidance WITHOUT skill: {rep['no_skill_avoidance_rate']}")
	    print(f"trap-avoidance WITH skill: {rep['with_skill_avoidance_rate']}")
	    print(f"AVOIDANCE UPLIFT: {rep['avoidance_uplift']}")
	    print(f"rescues (trapped->avoided): {rep['rescues']}")
	    print(f"regressions (avoided->trapped): {rep['regressions']}")
	    print("\nper-case:")
	    for r in rep["per_case"]:
	        # Cases marked as leaked are still listed, just tagged.
	        flag = " [LEAKED]" if r["leaked"] else ""
	        print(f" {r['name']}: without={r['no_skill_avoided']} with={r['with_skill_avoided']}{flag}")
	        print(f" without: {r['no_skill_detail']}")
	        print(f" with: {r['with_skill_detail']}")
	    print("\nraw answers saved to:", rep["raw_saved_to"])
	    print("\nInterpretation:")
	    print(" - rescues > 0 : the skill genuinely helped (model fell for the trap")
	    print(" unaided, avoided it with the skill). This is the slug's value.")
	    print(" - uplift ~0, both already high : the frontier model didn't need the skill")
	    print(" for these traps (they were not hard enough). Report honestly.")
	    print(" - regressions > 0 : the skill MISLED the model. Report this too.")


	def main() -> int:
	    """Run the distractor-trap eval end to end and print a report.

	    Steps: warn if answerer and grader are the same model, verify both
	    model ids, build the answerer and grader completers, run every case in
	    ``CASES``, then print the results.

	    Returns:
	        1 if model verification fails (the eval is not run), otherwise 0
	        after the report has been printed.
	    """
	    model = DEFAULT_ANSWERER_MODEL
	    grader_model = DEFAULT_GRADER_MODEL
	    if model == grader_model:
	        print("WARNING: answerer and grader are the same model; set EVAL_GRADER_MODEL "
	              "to a different model to keep the judge independent.")
	    if not _models_are_live(model, grader_model):
	        print("Set EVAL_ANSWERER_MODEL / EVAL_GRADER_MODEL to current ids (https://openrouter.ai/models).")
	        return 1

	    # Low temperature: we want the model's DEFAULT instinct, not creative variance.
	    answerer = make_completer(model, temperature=0.1, max_tokens=700)
	    grader = make_completer(grader_model, system=DISTRACTOR_JUDGE_SYSTEM,
	                            temperature=0.0, max_tokens=200)

	    print("\n--- Running distractor-trap eval ---")
	    print("(does the skill steer the model away from the tempting wrong approach?)")
	    print(f"answerer: {model} judge: {grader_model}")
	    rep = run_distractor_eval(CASES, answerer, grader, out_dir="./distractor_runs")

	    _print_report(model, rep)
	    return 0


	if __name__ == "__main__":
	    # Script entry point: main()'s return value becomes the process exit status.
	    exit_status = main()
	    sys.exit(exit_status)