Spaces:
Running on Zero
Running on Zero
| """Measured model eval: validity / repair / category stats per granularity. | |
| Runs the real ModelAdapter pipeline (prompt -> generate -> validate -> one | |
| repair) against an API backend and reports the numbers the field notes quote: | |
| .venv/bin/python scripts/eval_quality.py hf_inference | |
| NEBIUS_API_KEY=... .venv/bin/python scripts/eval_quality.py nebius | |
| The zerogpu prefill path can't run off-GPU; scripts/smoke_live.py covers it | |
| end-to-end against the deployed Space instead. | |
| """ | |
| from __future__ import annotations | |
| import json | |
| import os | |
| import sys | |
| import time | |
| from collections import Counter | |
| from collections.abc import Callable | |
| from pathlib import Path | |
| ROOT = Path(__file__).resolve().parents[1] | |
| sys.path.insert(0, str(ROOT / "src")) | |
| from unstuck.model_adapter import ModelAdapter | |
| from unstuck.schema import StepValidationError | |
| TASKS = [ | |
| "Clean my apartment before a friend visits tonight", | |
| "Start the first draft of a hackathon demo script", | |
| "Catch up on overdue email without losing the whole morning", | |
| "Prepare to call the dentist and book an appointment", | |
| "Make progress on a bug report that feels too vague to start", | |
| "Plan a small birthday dinner for four people", | |
| "Unpack and organise my desk after moving", | |
| "Write a cover letter for a job I actually want", | |
| "Sort out my tax documents before the deadline", | |
| "Practice guitar when I haven't touched it in a month", | |
| "Back up my laptop and phone properly", | |
| "Get back into running after three weeks off", | |
| ] | |
| GRANULARITIES = ["chunky", "regular", "tiny"] | |
| MAX_MINUTES = {"chunky": 25, "regular": 25, "tiny": 10} | |
| def make_generate(backend: str) -> Callable[[str], str]: | |
| from huggingface_hub import InferenceClient | |
| temperature = float(os.environ.get("UNSTUCK_TEMPERATURE", "0")) | |
| if backend == "hf_inference": | |
| client = InferenceClient("Qwen/Qwen3-4B-Instruct-2507") | |
| model = None | |
| elif backend == "nebius": | |
| client = InferenceClient( | |
| base_url=os.environ.get( | |
| "NEBIUS_BASE_URL", "https://api.tokenfactory.nebius.com/v1/" | |
| ), | |
| api_key=os.environ["NEBIUS_API_KEY"], | |
| ) | |
| model = os.environ.get("NEBIUS_MODEL", "Qwen/Qwen3-30B-A3B-Instruct-2507") | |
| else: | |
| raise SystemExit(f"unsupported backend for offline eval: {backend}") | |
| def generate(prompt: str) -> str: | |
| kwargs = {"model": model} if model else {} | |
| response = client.chat_completion( | |
| messages=[{"role": "user", "content": prompt}], | |
| max_tokens=512, | |
| temperature=temperature, | |
| **kwargs, | |
| ) | |
| return str(response.choices[0].message.content) | |
| return generate | |
| def main() -> int: | |
| backend = sys.argv[1] if len(sys.argv) > 1 else "hf_inference" | |
| base_generate = make_generate(backend) | |
| results = {} | |
| for granularity in GRANULARITIES: | |
| calls = 0 | |
| def counting_generate(prompt: str) -> str: | |
| nonlocal calls | |
| calls += 1 | |
| return base_generate(prompt) | |
| adapter = ModelAdapter(counting_generate, max_repairs=1) | |
| stats = { | |
| "tasks": 0, | |
| "valid": 0, | |
| "first_try": 0, | |
| "repaired": 0, | |
| "failed": 0, | |
| "steps": [], | |
| "minutes_violations": 0, | |
| "categories": Counter(), | |
| "seconds": 0.0, | |
| } | |
| for task in TASKS: | |
| calls = 0 | |
| stats["tasks"] += 1 | |
| t0 = time.time() | |
| try: | |
| steps = adapter.breakdown(task, granularity) | |
| except StepValidationError as exc: | |
| stats["failed"] += 1 | |
| print(f" {granularity} FAIL {task[:40]!r}: {exc}") | |
| continue | |
| finally: | |
| stats["seconds"] += time.time() - t0 | |
| stats["valid"] += 1 | |
| if calls == 1: | |
| stats["first_try"] += 1 | |
| else: | |
| stats["repaired"] += 1 | |
| stats["steps"].append(len(steps.steps)) | |
| for step in steps.steps: | |
| stats["categories"][step.category] += 1 | |
| if step.est_minutes > MAX_MINUTES[granularity]: | |
| stats["minutes_violations"] += 1 | |
| results[granularity] = stats | |
| print(f"\n== {backend} · temperature={os.environ.get('UNSTUCK_TEMPERATURE', '0')} ==") | |
| print("granularity valid first-try repaired failed steps(avg) >cap s/task categories") | |
| for granularity, s in results.items(): | |
| n = s["tasks"] | |
| avg_steps = sum(s["steps"]) / len(s["steps"]) if s["steps"] else 0 | |
| cats = ", ".join(f"{c}:{k}" for c, k in s["categories"].most_common()) | |
| print( | |
| f"{granularity:<11} {s['valid']}/{n:<4} {s['first_try']}/{n:<7}" | |
| f" {s['repaired']:<8} {s['failed']:<6} {avg_steps:<10.1f}" | |
| f" {s['minutes_violations']:<4} {s['seconds'] / n:<6.1f} {cats}" | |
| ) | |
| print(json.dumps({g: {k: (dict(v) if isinstance(v, Counter) else v) for k, v in s.items()} for g, s in results.items()}, default=str)) | |
| return 0 | |
| if __name__ == "__main__": | |
| sys.exit(main()) | |