| """ |
| Skill-Uplift Eval: does a generated SKILL.md help a model on a held-out task? |
| |
| Honest measurement of the judge's bar. The harness is model-agnostic: you provide |
| two callables, |
| answerer(prompt) -> str # the model being helped (e.g. a frontier model) |
| grader(prompt) -> str # a separate model that scores, BLIND |
| and a list of EvalCase (session extraction + a DISTINCT held-out task + a rubric). |
| |
| Pipeline (mirrors the groundedness eval's discipline): |
| 1. Build the SKILL.md from each session's extraction (the real generator). |
| 2. Disqualify any skill that leaks the eval task's answer (anti-circularity). |
| 3. For each case, get TWO answers from the SAME model: no-skill and with-skill. |
| 4. SAVE all raw generations to disk BEFORE scoring (no post-hoc massaging). |
| 5. Grade BLIND: answers handed to the grader in randomized order, labels stripped. |
| 6. Run a human-labeled CALIBRATION check first; report grader agreement. |
| 7. Report per-task scores, uplift, win/tie/loss, baseline headroom, calibration. |
| |
| Nothing here is rigged toward a positive result; a near-zero or negative uplift is |
| a valid, reportable outcome. |
| """ |
|
|
| from __future__ import annotations |
|
|
| import json |
| import os |
| import random |
| import re |
| from dataclasses import dataclass, field |
| from typing import Callable |
|
|
| |
| |
| try: |
| from skill_builder import build_skill_md |
| except Exception: |
| build_skill_md = None |
|
|
|
|
@dataclass
class EvalCase:
    """One evaluation item: a session extraction paired with the task it is tested on."""

    # Identifier for the case; run_eval uses it to name the saved raw-output file.
    name: str
    # Input handed to build_skill_md to generate the SKILL.md for this case.
    session_extraction: dict
    # Task text sent to the answerer (with and without the skill) and to the grader.
    task_prompt: str
    # Not read by the code visible in this module; presumably the expected
    # answer terms for the task -- TODO confirm against callers.
    answer_key_terms: list[str]
    # Substrings whose presence in the generated skill marks the case as leaked.
    leak_terms: list[str]
|
|
|
|
@dataclass
class CaseResult:
    """Everything recorded for one evaluated case: inputs, both answers, both scores."""

    # Name copied from the originating EvalCase.
    name: str
    # The generated SKILL.md text that was shown to the answerer.
    skill_md: str
    # Answer produced from the bare task prompt.
    no_skill_answer: str
    # Answer produced from the task prompt preceded by the skill document.
    with_skill_answer: str
    # Grader score for the no-skill answer.
    no_skill_score: float
    # Grader score for the with-skill answer.
    with_skill_score: float
    # True when the skill text contained one of the case's leak terms.
    leaked: bool
    # Object returned by the grading step (decoded grader JSON, or a
    # {"parse_error": ...} marker); defaults to a fresh empty dict per instance.
    grader_raw: dict = field(default_factory=dict)
|
|
|
|
# Instruction preamble placed at the top of every grading prompt. It tells the
# grader to score two anonymously labeled answers (X and Y) and to reply with
# compact JSON only, which is what the grading step then parses.
GRADER_SYSTEM = (
    "You are a strict, impartial grader. "
    "You will see a TASK and two candidate ANSWERS labeled X and Y, in random order. "
    "Score each answer from 0.0 to 1.0 "
    "for how correctly and completely it solves the task. "
    "Judge ONLY the answer's merit. "
    "You do not know how either answer was produced. "
    'Return ONLY compact JSON: {"X": <float>, "Y": <float>, "why": "<one sentence>"}.'
)
|
|
|
|
def _skill_leaks_answer(skill_md: str, leak_terms: list[str]) -> bool:
    """Return True if any leak term occurs in the skill text, ignoring case.

    Blank (empty or whitespace-only) terms are skipped. The empty string is a
    substring of every string, so a single blank entry would otherwise flag
    every skill as leaked and silently remove every case from the scored set.

    Args:
        skill_md: The generated SKILL.md text to inspect.
        leak_terms: Substrings that would give away the eval task's answer.

    Returns:
        True when at least one non-blank term is found in ``skill_md``.
    """
    haystack = skill_md.lower()
    return any(term.lower() in haystack for term in leak_terms if term.strip())
|
|
|
|
def _first_json_object(text: str) -> dict:
    """Decode and return the first JSON object embedded in *text*.

    Raises TypeError if *text* is not a string and ValueError if no JSON
    object can be decoded starting at any ``{`` in it.
    """
    if not isinstance(text, str):
        raise TypeError(f"grader returned {type(text).__name__}, expected str")
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            candidate, _ = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            candidate = None
        if isinstance(candidate, dict):
            return candidate
        # This brace did not open a decodable object (e.g. prose like "{draft}");
        # try the next one.
        start = text.find("{", start + 1)
    raise ValueError("no JSON object found in grader output")


def _unit_score(obj: dict, label: str) -> float:
    """Return ``obj[label]`` as a float in [0.0, 1.0]; raise ValueError otherwise."""
    if label not in obj:
        raise ValueError(f"grader JSON has no {label!r} score")
    score = float(obj[label])
    # The chained comparison is False for NaN too, so NaN is rejected here.
    if not 0.0 <= score <= 1.0:
        raise ValueError(f"score {label!r}={score!r} is outside [0.0, 1.0]")
    return score


def _grade_blind(grader: Callable[[str], str], task: str,
                 ans_a: str, ans_b: str, rng: random.Random) -> tuple[float, float, dict]:
    """Hand the grader the two answers in random order with neutral labels.

    Args:
        grader: Callable that receives the full grading prompt and returns text
            expected to contain JSON with "X" and "Y" scores.
        task: The task both answers attempt.
        ans_a: First answer; its score is returned first.
        ans_b: Second answer; its score is returned second.
        rng: Source of the coin flip that decides which answer is shown as X.

    Returns:
        ``(a_score, b_score, obj)``. ``obj`` is the decoded grader JSON on
        success. If the grader output is not a string, contains no decodable
        JSON object, lacks "X" or "Y", or carries a score that is non-numeric
        or outside [0.0, 1.0], both scores are 0.0 and ``obj`` is
        ``{"parse_error": <first 200 chars of the output>}`` so the failure
        can be told apart from a genuine 0/0 grade.
    """
    swap = rng.random() < 0.5
    first, second = (ans_b, ans_a) if swap else (ans_a, ans_b)
    prompt = (
        f"{GRADER_SYSTEM}\n\nTASK:\n{task}\n\n"
        f"ANSWER X:\n{first}\n\nANSWER Y:\n{second}\n\nJSON:"
    )
    raw = grader(prompt)
    try:
        obj = _first_json_object(raw)
        x = _unit_score(obj, "X")
        y = _unit_score(obj, "Y")
    except (TypeError, ValueError, OverflowError):
        x, y = 0.0, 0.0
        # str() keeps the handler itself from raising when raw is not a string.
        obj = {"parse_error": str(raw)[:200]}

    # Undo the shuffle so scores come back in (ans_a, ans_b) order.
    a_score, b_score = (y, x) if swap else (x, y)
    return a_score, b_score, obj
|
|
|
|
def _safe_file_stem(name: str) -> str:
    """Return *name* with path separators replaced so the file stays inside out_dir."""
    return re.sub(r"[\\/]+", "_", name)


def _build_report(results: list, out_dir: str) -> dict:
    """Aggregate per-case results into the summary dict that run_eval returns."""
    clean = [r for r in results if not r.leaked]
    if clean:
        mean_no = sum(r.no_skill_score for r in clean) / len(clean)
        mean_yes = sum(r.with_skill_score for r in clean) / len(clean)
        # The 1e-6 band keeps float noise from turning a tie into a win or loss.
        wins = sum(1 for r in clean if r.with_skill_score > r.no_skill_score + 1e-6)
        ties = sum(1 for r in clean if abs(r.with_skill_score - r.no_skill_score) <= 1e-6)
        losses = sum(1 for r in clean if r.with_skill_score < r.no_skill_score - 1e-6)
    else:
        mean_no = mean_yes = 0.0
        wins = ties = losses = 0

    return {
        "n_total": len(results),
        "n_leaked_excluded": len(results) - len(clean),
        "n_scored": len(clean),
        # Cases whose grader output could not be parsed are scored 0/0; counting
        # them lets a reader tell those apart from genuine ties.
        "n_grader_parse_errors": sum(1 for r in results if "parse_error" in r.grader_raw),
        "baseline_no_skill_mean": round(mean_no, 3),
        "with_skill_mean": round(mean_yes, 3),
        "uplift": round(mean_yes - mean_no, 3),
        "wins": wins, "ties": ties, "losses": losses,
        "per_case": [
            {"name": r.name, "no_skill": round(r.no_skill_score, 3),
             "with_skill": round(r.with_skill_score, 3),
             "delta": round(r.with_skill_score - r.no_skill_score, 3),
             "leaked": r.leaked,
             "grader_parse_error": "parse_error" in r.grader_raw}
            for r in results
        ],
        "raw_saved_to": out_dir,
    }


def run_eval(cases: list[EvalCase],
             answerer: Callable[[str], str],
             grader: Callable[[str], str],
             out_dir: str = "./skill_eval_runs",
             seed: int = 0) -> dict:
    """Run the skill-uplift evaluation over *cases* and return the summary report.

    For each case the skill document is built, checked against the case's leak
    terms, and the answerer is called twice (task alone, then task preceded by
    the skill). Both raw answers are written to ``out_dir`` before the grader
    is called. Leaked cases are still answered, graded, and listed under
    ``per_case``, but are left out of the means and the win/tie/loss counts.

    Args:
        cases: Evaluation cases to run.
        answerer: Callable mapping a prompt to the evaluated model's answer.
        grader: Callable mapping a grading prompt to the grader's reply.
        out_dir: Directory for per-case raw outputs and ``_report.json``;
            created if missing.
        seed: Seed for the RNG that randomizes answer order for the grader.

    Returns:
        The report dict, which is also written to ``<out_dir>/_report.json``.

    Raises:
        RuntimeError: If ``build_skill_md`` could not be imported.
    """
    if build_skill_md is None:
        raise RuntimeError("skill_builder.build_skill_md not importable; run from the repo.")
    os.makedirs(out_dir, exist_ok=True)
    rng = random.Random(seed)
    results: list[CaseResult] = []

    for case in cases:
        skill_md = build_skill_md(case.session_extraction)
        leaked = _skill_leaks_answer(skill_md, case.leak_terms)

        no_skill_prompt = case.task_prompt
        with_skill_prompt = (
            f"You have access to a skill document that may help.\n\n"
            f"--- SKILL.md ---\n{skill_md}\n--- end SKILL.md ---\n\n"
            f"TASK:\n{case.task_prompt}"
        )
        ans_no = answerer(no_skill_prompt)
        ans_yes = answerer(with_skill_prompt)

        # Persist the raw generations before any scoring happens. Path
        # separators in the case name are replaced so a name such as "a/b"
        # cannot fail or land outside out_dir after the answers were produced.
        raw_path = os.path.join(out_dir, f"{_safe_file_stem(case.name)}.json")
        with open(raw_path, "w", encoding="utf-8") as f:
            json.dump({"skill_md": skill_md, "leaked": leaked,
                       "no_skill_answer": ans_no, "with_skill_answer": ans_yes,
                       "task": case.task_prompt}, f, indent=2)

        # ans_no is passed first, so the first returned score is the no-skill one.
        no_score, with_score, graw = _grade_blind(grader, case.task_prompt, ans_no, ans_yes, rng)
        results.append(CaseResult(case.name, skill_md, ans_no, ans_yes,
                                  no_score, with_score, leaked, graw))

    report = _build_report(results, out_dir)
    with open(os.path.join(out_dir, "_report.json"), "w", encoding="utf-8") as f:
        json.dump(report, f, indent=2)
    return report
|
|
|
|
| |
def run_calibration(grader: Callable[[str], str],
                    labeled: list[dict], seed: int = 0) -> dict:
    """Measure how often the grader agrees with human preference labels.

    Each item in *labeled* is a dict with the keys ``task``, ``better_answer``
    and ``worse_answer``, where humans judged which answer is better. The pair
    is graded blind, and the grader agrees with the humans only when it scores
    the human-preferred answer strictly higher (a tie counts as disagreement).

    Returns:
        ``{"agreement": "<agreed>/<total>", "rows": [...]}``, with one row per
        item holding the task text cut to 50 characters, both rounded scores,
        and the agreement flag.
    """
    shuffler = random.Random(seed)
    rows = []
    for example in labeled:
        task = example["task"]
        # The worse answer goes in first, so its score comes back first.
        worse_score, better_score, _ = _grade_blind(
            grader, task, example["worse_answer"], example["better_answer"], shuffler
        )
        rows.append({
            "task": task[:50],
            "worse": round(worse_score, 3),
            "better": round(better_score, 3),
            "agree": better_score > worse_score,
        })
    n_agree = sum(1 for row in rows if row["agree"])
    return {"agreement": f"{n_agree}/{len(labeled)}", "rows": rows}
|
|