Spaces:

build-small-hackathon
/

unstuck

Running on Zero

App Files Files Community

unstuck / scripts /eval_quality.py

art87able

feat: similar-task recall via Nebius embeddings (keyless; recall degrades off without NEBIUS_API_KEY)

79e91df verified 13 days ago

Raw

History Blame Contribute Delete

5.17 kB

	"""Measured model eval: validity / repair / category stats per granularity.

	Runs the real ModelAdapter pipeline (prompt -> generate -> validate -> one
	repair) against an API backend and reports the numbers the field notes quote:

	.venv/bin/python scripts/eval_quality.py hf_inference
	NEBIUS_API_KEY=... .venv/bin/python scripts/eval_quality.py nebius

	The zerogpu prefill path can't run off-GPU; scripts/smoke_live.py covers it
	end-to-end against the deployed Space instead.
	"""

	from __future__ import annotations

	import json
	import os
	import sys
	import time
	from collections import Counter
	from collections.abc import Callable
	from pathlib import Path

	# Two levels up from this file: parents[0] is the containing directory,
	# parents[1] is its parent.
	ROOT = Path(__file__).resolve().parents[1]
	# Put <ROOT>/src at the front of the import path before the `unstuck` imports
	# below run (presumably so the package resolves without being installed).
	sys.path.insert(0, str(ROOT.joinpath("src")))

	from unstuck.model_adapter import ModelAdapter
	from unstuck.schema import StepValidationError

	# Fixed prompt set: every task is run once per granularity by the eval loop.
	TASKS = [
	    "Clean my apartment before a friend visits tonight",
	    "Start the first draft of a hackathon demo script",
	    "Catch up on overdue email without losing the whole morning",
	    "Prepare to call the dentist and book an appointment",
	    "Make progress on a bug report that feels too vague to start",
	    "Plan a small birthday dinner for four people",
	    "Unpack and organise my desk after moving",
	    "Write a cover letter for a job I actually want",
	    "Sort out my tax documents before the deadline",
	    "Practice guitar when I haven't touched it in a month",
	    "Back up my laptop and phone properly",
	    "Get back into running after three weeks off",
	]

	# Per-granularity ceiling on a single step's est_minutes; steps above it are
	# counted as "minutes_violations" in the report.
	MAX_MINUTES = dict(chunky=25, regular=25, tiny=10)

	# Evaluation order; dicts preserve insertion order, so this is
	# ["chunky", "regular", "tiny"].
	GRANULARITIES = list(MAX_MINUTES)


	def make_generate(backend: str) -> Callable[[str], str]:
	    """Build a ``prompt -> completion`` callable for an API backend.

	    All configuration is validated before the (third-party) client is
	    imported or constructed, so a bad invocation fails fast with a one-line
	    message instead of a traceback.

	    Args:
	        backend: ``"hf_inference"`` (an ``InferenceClient`` bound to
	            ``Qwen/Qwen3-4B-Instruct-2507``) or ``"nebius"`` (a client
	            pointed at ``NEBIUS_BASE_URL`` with ``NEBIUS_API_KEY``; the model
	            comes from ``NEBIUS_MODEL``).

	    Returns:
	        A function that sends ``prompt`` as a single user message
	        (``max_tokens=512``, temperature from ``UNSTUCK_TEMPERATURE``,
	        default ``0``) and returns the reply text.

	    Raises:
	        SystemExit: if ``backend`` is unsupported, ``UNSTUCK_TEMPERATURE`` is
	            not a number, or ``NEBIUS_API_KEY`` is missing/empty for the
	            ``nebius`` backend.
	    """
	    if backend not in ("hf_inference", "nebius"):
	        raise SystemExit(f"unsupported backend for offline eval: {backend}")

	    raw_temperature = os.environ.get("UNSTUCK_TEMPERATURE", "0")
	    try:
	        temperature = float(raw_temperature)
	    except ValueError:
	        raise SystemExit(
	            f"UNSTUCK_TEMPERATURE must be a number, got {raw_temperature!r}"
	        ) from None

	    api_key = None
	    if backend == "nebius":
	        api_key = os.environ.get("NEBIUS_API_KEY")
	        if not api_key:
	            raise SystemExit("NEBIUS_API_KEY must be set for the nebius backend")

	    # Imported lazily so the checks above work without the dependency.
	    from huggingface_hub import InferenceClient

	    if backend == "hf_inference":
	        client = InferenceClient("Qwen/Qwen3-4B-Instruct-2507")
	        model = None
	    else:
	        client = InferenceClient(
	            base_url=os.environ.get(
	                "NEBIUS_BASE_URL", "https://api.tokenfactory.nebius.com/v1/"
	            ),
	            api_key=api_key,
	        )
	        model = os.environ.get("NEBIUS_MODEL", "Qwen/Qwen3-30B-A3B-Instruct-2507")

	    def generate(prompt: str) -> str:
	        # Only pass ``model`` when one was chosen explicitly; the hf_inference
	        # client already has its model bound at construction time.
	        kwargs = {"model": model} if model else {}
	        response = client.chat_completion(
	            messages=[{"role": "user", "content": prompt}],
	            max_tokens=512,
	            temperature=temperature,
	            **kwargs,
	        )
	        content = response.choices[0].message.content
	        # A missing reply becomes "" rather than the literal text "None".
	        return "" if content is None else str(content)

	    return generate


	def _evaluate_granularity(base_generate: Callable[[str], str], granularity: str) -> dict:
	    """Run every task in TASKS at one granularity and return its raw stats."""
	    calls = 0

	    def counting_generate(prompt: str) -> str:
	        # Counts model calls per task so a first-try success (1 call) can be
	        # told apart from one that needed more calls.
	        nonlocal calls
	        calls += 1
	        return base_generate(prompt)

	    adapter = ModelAdapter(counting_generate, max_repairs=1)
	    stats = {
	        "tasks": 0,
	        "valid": 0,
	        "first_try": 0,
	        "repaired": 0,
	        "failed": 0,
	        "steps": [],
	        "minutes_violations": 0,
	        "categories": Counter(),
	        "seconds": 0.0,
	    }
	    cap = MAX_MINUTES[granularity]
	    for task in TASKS:
	        calls = 0
	        stats["tasks"] += 1
	        # perf_counter is monotonic, so the elapsed time cannot go negative or
	        # jump if the system clock is adjusted mid-run.
	        started = time.perf_counter()
	        try:
	            steps = adapter.breakdown(task, granularity)
	        except StepValidationError as exc:
	            stats["failed"] += 1
	            print(f" {granularity} FAIL {task[:40]!r}: {exc}")
	            continue
	        finally:
	            # Time is charged for failed tasks too.
	            stats["seconds"] += time.perf_counter() - started
	        stats["valid"] += 1
	        if calls == 1:
	            stats["first_try"] += 1
	        else:
	            stats["repaired"] += 1
	        stats["steps"].append(len(steps.steps))
	        for step in steps.steps:
	            stats["categories"][step.category] += 1
	            if step.est_minutes > cap:
	                stats["minutes_violations"] += 1
	    return stats


	def _print_report(backend: str, results: dict) -> None:
	    """Print the per-granularity table followed by one JSON line of raw stats."""
	    print(f"\n== {backend} · temperature={os.environ.get('UNSTUCK_TEMPERATURE', '0')} ==")
	    print("granularity valid first-try repaired failed steps(avg) >cap s/task categories")
	    for granularity, s in results.items():
	        n = s["tasks"]
	        avg_steps = sum(s["steps"]) / len(s["steps"]) if s["steps"] else 0
	        # Guard against an empty task list instead of dividing by zero.
	        seconds_per_task = s["seconds"] / n if n else 0.0
	        cats = ", ".join(f"{c}:{k}" for c, k in s["categories"].most_common())
	        print(
	            f"{granularity:<11} {s['valid']}/{n:<4} {s['first_try']}/{n:<7}"
	            f" {s['repaired']:<8} {s['failed']:<6} {avg_steps:<10.1f}"
	            f" {s['minutes_violations']:<4} {seconds_per_task:<6.1f} {cats}"
	        )
	    serialisable = {
	        g: {k: (dict(v) if isinstance(v, Counter) else v) for k, v in s.items()}
	        for g, s in results.items()
	    }
	    print(json.dumps(serialisable, default=str))


	def main() -> int:
	    """Evaluate every granularity against one backend and print the report.

	    The backend name is read from ``sys.argv[1]`` and defaults to
	    ``"hf_inference"``. Each task in TASKS is broken down once per entry in
	    GRANULARITIES through a ``ModelAdapter`` built with ``max_repairs=1``.
	    ``StepValidationError`` is counted as a failure and reported; any other
	    exception propagates.

	    Returns:
	        ``0``, used as the process exit status.
	    """
	    backend = sys.argv[1] if len(sys.argv) > 1 else "hf_inference"
	    base_generate = make_generate(backend)
	    results = {
	        granularity: _evaluate_granularity(base_generate, granularity)
	        for granularity in GRANULARITIES
	    }
	    _print_report(backend, results)
	    return 0


	# Script entry point: main()'s return value becomes the process exit status.
	if __name__ == "__main__":
	    raise SystemExit(main())