Spaces:

build-small-hackathon
/

TurboSkillSlug

Running

App Files Files Community

TurboSkillSlug / skill_uplift_eval.py

legendarydragontamer

deploy

51a9974 19 days ago

Raw

History Blame Contribute Delete

7.6 kB

	"""
	Skill-Uplift Eval: does a generated SKILL.md help a model on a held-out task?

	Honest measurement of the judge's bar. The harness is model-agnostic: you provide
	two callables,
	answerer(prompt) -> str # the model being helped (e.g. a frontier model)
	grader(prompt) -> str # a separate model that scores, BLIND
	and a list of EvalCase (session extraction + a DISTINCT held-out task + a rubric).

	Pipeline (mirrors the groundedness eval's discipline):
	1. Build the SKILL.md from each session's extraction (the real generator).
	2. Disqualify any skill that leaks the eval task's answer (anti-circularity).
	3. For each case, get TWO answers from the SAME model: no-skill and with-skill.
	4. SAVE all raw generations to disk BEFORE scoring (no post-hoc massaging).
	5. Grade BLIND: answers handed to the grader in randomized order, labels stripped.
	6. Run a human-labeled CALIBRATION check first; report grader agreement.
	7. Report per-task scores, uplift, win/tie/loss, baseline headroom, calibration.

	Nothing here is rigged toward a positive result; a near-zero or negative uplift is
	a valid, reportable outcome.
	"""

	from __future__ import annotations

	import json
	import os
	import random
	import re
	from dataclasses import dataclass, field
	from typing import Callable

	# Import the REAL skill generator so we eval what actually ships.
	# (adjust path to your repo when running)
	try:
	from skill_builder import build_skill_md # type: ignore
	except Exception:
	build_skill_md = None # the harness will error clearly if not wired up


	@dataclass
	class EvalCase:
	name: str
	session_extraction: dict # the session the skill is built FROM
	task_prompt: str # a DISTINCT held-out task in the same class
	answer_key_terms: list[str] # terms a correct answer should contain (rubric aid)
	leak_terms: list[str] # if the SKILL.md contains these, it leaked the answer


	@dataclass
	class CaseResult:
	name: str
	skill_md: str
	no_skill_answer: str
	with_skill_answer: str
	no_skill_score: float
	with_skill_score: float
	leaked: bool
	grader_raw: dict = field(default_factory=dict)


	GRADER_SYSTEM = (
	"You are a strict, impartial grader. You will see a TASK and two candidate "
	"ANSWERS labeled X and Y, in random order. Score each answer from 0.0 to 1.0 "
	"for how correctly and completely it solves the task. Judge ONLY the answer's "
	"merit. You do not know how either answer was produced. Return ONLY compact "
	"JSON: {\"X\": <float>, \"Y\": <float>, \"why\": \"<one sentence>\"}."
	)


	def _skill_leaks_answer(skill_md: str, leak_terms: list[str]) -> bool:
	s = skill_md.lower()
	return any(t.lower() in s for t in leak_terms)


	def _grade_blind(grader: Callable[[str], str], task: str,
	ans_a: str, ans_b: str, rng: random.Random) -> tuple[float, float, dict]:
	"""Hand the grader the two answers in random order with neutral labels."""
	swap = rng.random() < 0.5
	first, second = (ans_b, ans_a) if swap else (ans_a, ans_b)
	prompt = (
	f"{GRADER_SYSTEM}\n\nTASK:\n{task}\n\n"
	f"ANSWER X:\n{first}\n\nANSWER Y:\n{second}\n\nJSON:"
	)
	raw = grader(prompt)
	try:
	m = re.search(r"\{.*\}", raw, re.DOTALL)
	obj = json.loads(m.group(0)) if m else {}
	x = float(obj.get("X", 0.0)); y = float(obj.get("Y", 0.0))
	except Exception:
	x, y = 0.0, 0.0
	obj = {"parse_error": raw[:200]}
	# unswap back to (a=no_skill, b=with_skill)
	a_score, b_score = (y, x) if swap else (x, y)
	return a_score, b_score, obj


	def run_eval(cases: list[EvalCase],
	answerer: Callable[[str], str],
	grader: Callable[[str], str],
	out_dir: str = "./skill_eval_runs",
	seed: int = 0) -> dict:
	if build_skill_md is None:
	raise RuntimeError("skill_builder.build_skill_md not importable; run from the repo.")
	os.makedirs(out_dir, exist_ok=True)
	rng = random.Random(seed)
	results: list[CaseResult] = []

	for case in cases:
	skill_md = build_skill_md(case.session_extraction)
	leaked = _skill_leaks_answer(skill_md, case.leak_terms)

	no_skill_prompt = case.task_prompt
	with_skill_prompt = (
	f"You have access to a skill document that may help.\n\n"
	f"--- SKILL.md ---\n{skill_md}\n--- end SKILL.md ---\n\n"
	f"TASK:\n{case.task_prompt}"
	)
	ans_no = answerer(no_skill_prompt)
	ans_yes = answerer(with_skill_prompt)

	# SAVE raw BEFORE grading
	with open(os.path.join(out_dir, f"{case.name}.json"), "w") as f:
	json.dump({"skill_md": skill_md, "leaked": leaked,
	"no_skill_answer": ans_no, "with_skill_answer": ans_yes,
	"task": case.task_prompt}, f, indent=2)

	# leaked skills are scored but FLAGGED and excluded from headline uplift
	a_score, b_score, graw = _grade_blind(grader, case.task_prompt, ans_no, ans_yes, rng)
	results.append(CaseResult(case.name, skill_md, ans_no, ans_yes,
	a_score, b_score, leaked, graw))

	# ---- aggregate, EXCLUDING leaked skills from the headline number ----
	clean = [r for r in results if not r.leaked]
	leaked_n = sum(1 for r in results if r.leaked)
	if clean:
	mean_no = sum(r.no_skill_score for r in clean) / len(clean)
	mean_yes = sum(r.with_skill_score for r in clean) / len(clean)
	wins = sum(1 for r in clean if r.with_skill_score > r.no_skill_score + 1e-6)
	ties = sum(1 for r in clean if abs(r.with_skill_score - r.no_skill_score) <= 1e-6)
	losses = sum(1 for r in clean if r.with_skill_score < r.no_skill_score - 1e-6)
	else:
	mean_no = mean_yes = 0.0; wins = ties = losses = 0

	report = {
	"n_total": len(results),
	"n_leaked_excluded": leaked_n,
	"n_scored": len(clean),
	"baseline_no_skill_mean": round(mean_no, 3),
	"with_skill_mean": round(mean_yes, 3),
	"uplift": round(mean_yes - mean_no, 3),
	"wins": wins, "ties": ties, "losses": losses,
	"per_case": [
	{"name": r.name, "no_skill": round(r.no_skill_score, 3),
	"with_skill": round(r.with_skill_score, 3),
	"delta": round(r.with_skill_score - r.no_skill_score, 3),
	"leaked": r.leaked}
	for r in results
	],
	"raw_saved_to": out_dir,
	}
	with open(os.path.join(out_dir, "_report.json"), "w") as f:
	json.dump(report, f, indent=2)
	return report


	# ---- calibration: check the blind grader agrees with human labels ----
	def run_calibration(grader: Callable[[str], str],
	labeled: list[dict], seed: int = 0) -> dict:
	"""labeled: [{task, better_answer, worse_answer}] where humans judged which is
	better. We check the grader scores the human-better one higher. Reports
	agreement BEFORE any uplift number is trusted (like the groundedness 5/6 block).
	"""
	rng = random.Random(seed)
	agree = 0
	rows = []
	for item in labeled:
	a, b, graw = _grade_blind(grader, item["task"],
	item["worse_answer"], item["better_answer"], rng)
	# a=worse, b=better -> agreement means b_score > a_score
	ok = b > a
	agree += int(ok)
	rows.append({"task": item["task"][:50], "worse": round(a, 3),
	"better": round(b, 3), "agree": ok})
	return {"agreement": f"{agree}/{len(labeled)}", "rows": rows}