Spaces:

build-small-hackathon
/

TurboSkillSlug

Sleeping

App Files Files Community

TurboSkillSlug / run_skill_eval.py

legendarydragontamer

deploy

51a9974 15 days ago

Raw

History Blame Contribute Delete

4.63 kB

	"""
	Run the skill-uplift eval against real frontier models via OpenRouter.

	PREREQUISITES (do these first; never hardcode the key):
	    export OPENROUTER_API_KEY=sk-or-...
	    # optional overrides if the catalog has moved:
	    export EVAL_ANSWERER_MODEL=anthropic/claude-opus-4.6
	    export EVAL_GRADER_MODEL=openai/gpt-5.1
	Run from your repo root so it imports the REAL skill_builder.build_skill_md:
	    python run_skill_eval.py

	It will: verify the models are live -> run grader calibration -> run the eval ->
	write skill_eval_runs/_report.json and print a plain-language summary. It reports
	whatever number comes out. A modest or zero uplift is a valid, honest result.
	"""

	from __future__ import annotations

	import json
	import sys

	from openrouter_client import (
	make_completer, verify_models,
	DEFAULT_ANSWERER_MODEL, DEFAULT_GRADER_MODEL,
	)
	from skill_uplift_eval import run_eval, run_calibration, GRADER_SYSTEM
	from sample_cases import CASES

	# Hand-labeled calibration pairs, one (task, better answer, worse answer)
	# triple per entry. A human decided which answer is better; the grader is
	# expected to agree. Replace these with your own labeled pairs as needed.
	_LABELED_PAIRS = (
	    (
	        "Efficiently compute the sum of values on every root-to-leaf path's max in a tree.",
	        "Do a single post-order (bottom-up) DFS; at each node combine the max of its children's results in O(n) total.",
	        "From each node, separately walk down every path to a leaf and take the max; simple but O(n^2) or worse.",
	    ),
	    (
	        "Find the absorption probability in a Markov chain with two absorbing states.",
	        "Write first-step equations for the transient states only, excluding absorbing rows, and solve the linear system.",
	        "Run a large Monte Carlo simulation and estimate the fraction of runs that end in the target state.",
	    ),
	    (
	        "Avoid a stack overflow when recursing over a very deep tree in Python.",
	        "Convert the recursion to an explicit stack/iterative form, or raise the recursion limit knowingly; the depth is the real cause.",
	        "Wrap the recursion in a try/except and hope it does not overflow.",
	    ),
	)

	# Expand each triple into the dict shape passed to run_calibration.
	CALIBRATION = [
	    {"task": task, "better_answer": better, "worse_answer": worse}
	    for task, better, worse in _LABELED_PAIRS
	]


	def main() -> int:
	    """Verify the models, calibrate the grader, run the eval, print a summary.

	    Everything is reported on stdout; the function itself takes no arguments
	    and reads the model ids from DEFAULT_ANSWERER_MODEL / DEFAULT_GRADER_MODEL.

	    Returns:
	        1 if ``verify_models`` reports anything other than ``True`` for a
	        model id (calibration and the eval are then skipped), otherwise 0
	        once the result summary has been printed.
	    """
	    answerer_id = DEFAULT_ANSWERER_MODEL
	    grader_id = DEFAULT_GRADER_MODEL
	    if answerer_id == grader_id:
	        same_model_warning = (
	            "WARNING: answerer and grader are the same model; pick different "
	            "models to reduce self-grading bias (set EVAL_GRADER_MODEL)."
	        )
	        print(same_model_warning)

	    print("Verifying models are live on OpenRouter...")
	    live = verify_models(answerer_id, grader_id)
	    for model_id, state in live.items():
	        # Only the literal True counts as live; any other value is echoed
	        # as-is (presumably an error description -- confirm in verify_models).
	        shown = "OK" if state is True else state
	        print(f" {model_id}: {shown}")
	    if any(state is not True for state in live.values()):
	        catalog_hint = (
	            "\nOne or more model ids are not in the live catalog. Set "
	            "EVAL_ANSWERER_MODEL / EVAL_GRADER_MODEL to current ids "
	            "(see https://openrouter.ai/models) and re-run."
	        )
	        print(catalog_hint)
	        return 1

	    answerer = make_completer(answerer_id, temperature=0.2, max_tokens=900)
	    grader = make_completer(
	        grader_id,
	        system=GRADER_SYSTEM,
	        temperature=0.0,
	        max_tokens=300,
	    )

	    # Calibration runs first so its agreement figure can be shown next to
	    # the eval result in the final summary.
	    print("\n--- Calibration (does the blind grader agree with human labels?) ---")
	    calibration = run_calibration(grader, CALIBRATION)
	    print("grader agreement:", calibration["agreement"])
	    for entry in calibration["rows"]:
	        print(
	            f" agree={entry['agree']} better={entry['better']}"
	            f" worse={entry['worse']} {entry['task']}"
	        )

	    print("\n--- Running skill-uplift eval ---")
	    report = run_eval(CASES, answerer, grader, out_dir="./skill_eval_runs")

	    print("\n========== RESULT ==========")
	    print(f"answerer: {answerer_id} grader: {grader_id}")
	    print(f"calibration: {calibration['agreement']}")
	    print(
	        f"cases scored: {report['n_scored']}"
	        f" (excluded as leaked: {report['n_leaked_excluded']})"
	    )
	    print(f"baseline (no skill): {report['baseline_no_skill_mean']}")
	    print(f"with skill: {report['with_skill_mean']}")
	    print(f"UPLIFT: {report['uplift']}")
	    print(
	        f"win / tie / loss: {report['wins']} / {report['ties']}"
	        f" / {report['losses']}"
	    )
	    print("per-case:")
	    for case in report["per_case"]:
	        # Leaked cases are still listed, just tagged in the output line.
	        if case["leaked"]:
	            suffix = " [LEAKED, excluded]"
	        else:
	            suffix = ""
	        print(
	            f" {case['name']}: no={case['no_skill']} with={case['with_skill']}"
	            f" Δ={case['delta']}{suffix}"
	        )
	    print("\nraw generations saved to:", report["raw_saved_to"])
	    print("\nReport the number above as-is. Modest or zero uplift is a valid result.")
	    return 0


	# Script entry point: main()'s return value becomes the process exit status.
	if __name__ == "__main__":
	    raise SystemExit(main())