Spaces:

build-small-hackathon
/

TurboSkillSlug

Sleeping

App Files Files Community

TurboSkillSlug / distractor_eval.py

legendarydragontamer

deploy

51a9974 16 days ago

Raw

History Blame Contribute Delete

5.81 kB

	"""
	Distractor-trap eval: does the skill steer a frontier model AWAY from the tempting
	wrong approach it would otherwise take?

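	Scoring is judge-first, with the objective signature scan kept as a legacy helper:

	1. JUDGE score (what run_distractor_eval reports): a separate grader model sees
	the task, a description of the trap, a description of the correct approach, and
	ONE answer, and classifies the answer's primary recommendation as "trap",
	"correct" or "unclear". The "avoided-trap" score = primary is "correct".
	Mentioning the trap only to warn against it does not count against the answer.

	2. SIGNATURE score (legacy, objective, no model judgment): scan the answer for
	trap vs correct signatures. trap_present and correct_present are booleans, and
	avoided = correct_present AND NOT trap_present. The helper and the signature
	fields are retained for reference; they do not feed the headline metric.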

	The headline metric is the AVOIDANCE RATE: how often the model avoided the trap
	WITH the skill vs WITHOUT. This is the thing a frontier model genuinely cannot get
	from weights alone — knowledge of which tempting path is a dead end in context.

	Raw answers are saved before scoring. Leaked skills are excluded.
	"""

	from __future__ import annotations

	import json
	import os
	import re

	try:
	from skill_builder import build_skill_md
	except Exception:
	build_skill_md = None


	from dataclasses import dataclass, field


	@dataclass
	class DistractorCase:
	    """One distractor-trap scenario: the OBVIOUS approach to the task is wrong.

	    The skill is built from ``session_extraction`` and its gotcha is what
	    warns of the trap.

	    Fields:
	        name: label for the case.
	        session_extraction: the session the skill is built FROM.
	        task_prompt: task whose naive solution is the trap.
	        trap_signature: legacy heuristic, retained for reference.
	        correct_signature: legacy heuristic, retained for reference.
	        leak_terms: terms indicating the skill leaked the task answer;
	            defaults to a fresh empty list per instance.
	        trap_desc: plain description of the trap, for the LLM judge.
	        correct_desc: plain description of the correct approach.
	    """
	    name: str
	    session_extraction: dict
	    task_prompt: str
	    trap_signature: list
	    correct_signature: list
	    leak_terms: list = field(default_factory=list)
	    trap_desc: str = field(default="")
	    correct_desc: str = field(default="")



	DISTRACTOR_JUDGE_SYSTEM = (
	"You are a strict, impartial code reviewer. You will see a TASK, a description "
	"of a KNOWN TRAP (a tempting but wrong approach), the CORRECT approach, and ONE "
	"candidate ANSWER. Decide what the answer ACTUALLY RECOMMENDS as its primary "
	"solution. Mentioning the trap in order to WARN against it is GOOD, not a "
	"failure. Return ONLY compact JSON: "
	'{"primary": "trap" \| "correct" \| "unclear", '
	'"warns_about_trap": true\|false, '
	'"why": "<one sentence>"}'
	)


	def _judge_answer(grader, task, trap_desc, correct_desc, answer):
	prompt = (
	f"{DISTRACTOR_JUDGE_SYSTEM}\n\nTASK:\n{task}\n\n"
	f"KNOWN TRAP (tempting but wrong): {trap_desc}\n"
	f"CORRECT approach: {correct_desc}\n\n"
	f"ANSWER:\n{answer}\n\nJSON:"
	)
	raw = grader(prompt)
	try:
	m = re.search(r"\{.*\}", raw, re.DOTALL)
	obj = json.loads(m.group(0)) if m else {}
	except Exception:
	obj = {"primary": "unclear", "warns_about_trap": False, "parse_error": raw[:200]}
	obj["avoided"] = (obj.get("primary") == "correct")
	return obj


	def _has_any(text: str, needles: list[str]) -> bool:
	t = text.lower()
	return any(n.lower() in t for n in needles)


	def _avoided_trap(answer: str, trap_sig: list[str], correct_sig: list[str]) -> dict:
	    """Signature-based check of whether *answer* avoided the trap.

	    Returns a dict with three booleans: ``trap_present`` (some trap signature
	    occurs anywhere in the answer), ``correct_present`` (some correct
	    signature occurs), and ``avoided``.
	    """
	    uses_trap = _has_any(answer, trap_sig)
	    uses_correct = _has_any(answer, correct_sig)
	    return {
	        "trap_present": uses_trap,
	        "correct_present": uses_correct,
	        # avoided = a correct signature is present and no trap signature is
	        "avoided": uses_correct and not uses_trap,
	    }


	def run_distractor_eval(cases, answerer, grader, out_dir="./distractor_runs", seed=0):
	    """Answer each case with and without its skill, judge both, and report.

	    For every case the skill document is built from ``case.session_extraction``,
	    the task is answered twice (bare prompt, then prompt prefixed with the
	    skill), and the raw material is written to ``<out_dir>/<case.name>.json``
	    before either answer is judged. Cases whose skill contains any of
	    ``case.leak_terms`` are still answered and judged, but are left out of the
	    aggregate rates and counts.

	    Args:
	        cases: iterable of case objects (see ``DistractorCase``).
	        answerer: callable taking a prompt string and returning the answer.
	        grader: callable handed to ``_judge_answer`` for each answer.
	        out_dir: directory for the per-case dumps and ``_report.json``;
	            created if missing.
	        seed: accepted but not used by this function.

	    Returns:
	        The report dict, which is also written to ``<out_dir>/_report.json``.

	    Raises:
	        RuntimeError: if ``build_skill_md`` is None.
	    """
	    if build_skill_md is None:
	        raise RuntimeError("skill_builder.build_skill_md not importable; run from the repo.")
	    os.makedirs(out_dir, exist_ok=True)
	    per_case = []
	    for case in cases:
	        skill_md = build_skill_md(case.session_extraction)
	        leaked = _has_any(skill_md, case.leak_terms)
	        task = case.task_prompt
	        baseline_answer = answerer(task)
	        skill_prompt = (
	            "You have access to a skill document that may help.\n\n"
	            f"--- SKILL.md ---\n{skill_md}\n--- end SKILL.md ---\n\n"
	            f"TASK:\n{task}"
	        )
	        skilled_answer = answerer(skill_prompt)
	        # Persist the raw answers first so they survive a failure while judging.
	        raw_record = {
	            "skill_md": skill_md,
	            "leaked": leaked,
	            "no_skill": baseline_answer,
	            "with_skill": skilled_answer,
	            "task": task,
	        }
	        raw_path = os.path.join(out_dir, f"{case.name}.json")
	        with open(raw_path, "w") as fh:
	            json.dump(raw_record, fh, indent=2)
	        trap_text = case.trap_desc or "the naive/obvious approach"
	        correct_text = case.correct_desc or "the non-obvious correct approach"
	        baseline_verdict = _judge_answer(grader, task, trap_text, correct_text, baseline_answer)
	        skilled_verdict = _judge_answer(grader, task, trap_text, correct_text, skilled_answer)
	        per_case.append({
	            "name": case.name,
	            "leaked": leaked,
	            "no_skill_avoided": baseline_verdict["avoided"],
	            "with_skill_avoided": skilled_verdict["avoided"],
	            "no_skill_detail": baseline_verdict,
	            "with_skill_detail": skilled_verdict,
	        })
	    # Only non-leaked cases count towards the aggregate numbers.
	    scored = [row for row in per_case if not row["leaked"]]
	    total = len(scored)
	    baseline_hits = sum(row["no_skill_avoided"] for row in scored)
	    skilled_hits = sum(row["with_skill_avoided"] for row in scored)
	    baseline_rate = baseline_hits / total if total else 0
	    skilled_rate = skilled_hits / total if total else 0
	    rescues = 0
	    regressions = 0
	    for row in scored:
	        before = row["no_skill_avoided"]
	        after = row["with_skill_avoided"]
	        if after and not before:
	            # "rescue": the skill FLIPPED the case from trapped -> avoided
	            rescues += 1
	        elif before and not after:
	            regressions += 1
	    report = {
	        "n_scored": total,
	        "n_leaked_excluded": len(per_case) - total,
	        "no_skill_avoidance_rate": round(baseline_rate, 3),
	        "with_skill_avoidance_rate": round(skilled_rate, 3),
	        "avoidance_uplift": round(skilled_rate - baseline_rate, 3),
	        "rescues": rescues,
	        "regressions": regressions,
	        "per_case": per_case,
	        "raw_saved_to": out_dir,
	    }
	    report_path = os.path.join(out_dir, "_report.json")
	    with open(report_path, "w") as fh:
	        json.dump(report, fh, indent=2)
	    return report