Spaces:
Running on Zero
Running on Zero
File size: 5,170 Bytes
"""Measured model eval: validity / repair / category stats per granularity.

Runs the real ModelAdapter pipeline (prompt -> generate -> validate -> one
repair) against an API backend and reports the numbers the field notes quote:

    .venv/bin/python scripts/eval_quality.py hf_inference
    NEBIUS_API_KEY=... .venv/bin/python scripts/eval_quality.py nebius

The zerogpu prefill path can't run off-GPU; scripts/smoke_live.py covers it
end-to-end against the deployed Space instead.
"""
from __future__ import annotations
import json
import os
import sys
import time
from collections import Counter
from collections.abc import Callable
from pathlib import Path
ROOT = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(ROOT / "src"))
from unstuck.model_adapter import ModelAdapter
from unstuck.schema import StepValidationError
# Sample to-do items; main() asks the adapter to break each one down once per
# granularity.
TASKS = [
    "Clean my apartment before a friend visits tonight",
    "Start the first draft of a hackathon demo script",
    "Catch up on overdue email without losing the whole morning",
    "Prepare to call the dentist and book an appointment",
    "Make progress on a bug report that feels too vague to start",
    "Plan a small birthday dinner for four people",
    "Unpack and organise my desk after moving",
    "Write a cover letter for a job I actually want",
    "Sort out my tax documents before the deadline",
    "Practice guitar when I haven't touched it in a month",
    "Back up my laptop and phone properly",
    "Get back into running after three weeks off",
]

# Granularity levels, in the order they are evaluated and reported.
GRANULARITIES = [
    "chunky",
    "regular",
    "tiny",
]

# Upper bound on a step's est_minutes per granularity; steps above the bound
# are tallied as "minutes_violations" by main().
MAX_MINUTES = dict(chunky=25, regular=25, tiny=10)
def make_generate(backend: str) -> Callable[[str], str]:
    """Build a ``prompt -> completion text`` function for an API backend.

    Args:
        backend: ``"hf_inference"`` (an InferenceClient created for
            ``Qwen/Qwen3-4B-Instruct-2507``; no ``model`` argument is passed
            per call) or ``"nebius"`` (an InferenceClient created from
            ``NEBIUS_BASE_URL`` / ``NEBIUS_API_KEY``; ``NEBIUS_MODEL`` selects
            the model and defaults to ``Qwen/Qwen3-30B-A3B-Instruct-2507``).

    Returns:
        A callable that sends the prompt as a single user message with
        ``max_tokens=512`` and the configured temperature, and returns the
        first choice's message content converted with ``str()``.

    Raises:
        SystemExit: if ``backend`` is not supported, or ``"nebius"`` is
            requested while ``NEBIUS_API_KEY`` is unset or empty.
        ValueError: if ``UNSTUCK_TEMPERATURE`` is set but is not a float.
    """
    # Validate the request before importing huggingface_hub or touching the
    # network, so configuration mistakes fail fast with a readable message.
    if backend not in ("hf_inference", "nebius"):
        raise SystemExit(f"unsupported backend for offline eval: {backend}")

    api_key = None
    if backend == "nebius":
        api_key = os.environ.get("NEBIUS_API_KEY")
        if not api_key:
            raise SystemExit("NEBIUS_API_KEY must be set for the nebius backend")

    temperature = float(os.environ.get("UNSTUCK_TEMPERATURE", "0"))

    # Imported lazily so the dependency is only needed for a valid request.
    from huggingface_hub import InferenceClient

    if backend == "hf_inference":
        client = InferenceClient("Qwen/Qwen3-4B-Instruct-2507")
        model = None
    else:  # backend == "nebius"
        client = InferenceClient(
            base_url=os.environ.get(
                "NEBIUS_BASE_URL", "https://api.tokenfactory.nebius.com/v1/"
            ),
            api_key=api_key,
        )
        model = os.environ.get("NEBIUS_MODEL", "Qwen/Qwen3-30B-A3B-Instruct-2507")

    def generate(prompt: str) -> str:
        """Send ``prompt`` as one user message and return the reply text."""
        # Only pass ``model`` when one was chosen; the hf_inference client was
        # already constructed for a specific model.
        kwargs = {"model": model} if model else {}
        response = client.chat_completion(
            messages=[{"role": "user", "content": prompt}],
            max_tokens=512,
            temperature=temperature,
            **kwargs,
        )
        # NOTE(review): str() turns a missing (None) content into the literal
        # "None"; kept as-is so downstream validation sees the same text.
        return str(response.choices[0].message.content)

    return generate
def main() -> int:
    """Run every task at every granularity and print the quality report.

    The backend name is taken from ``sys.argv[1]`` and defaults to
    ``"hf_inference"``. For each granularity a ``ModelAdapter`` with
    ``max_repairs=1`` is driven over ``TASKS`` through a call-counting wrapper:
    a breakdown that succeeds after exactly one generate call is tallied as
    first-try, any other success as repaired, and a ``StepValidationError`` as
    failed. A text table is printed, followed by a one-line JSON dump of the
    raw statistics.

    Returns:
        Always 0, which the entry guard uses as the process exit status.
    """
    backend = sys.argv[1] if len(sys.argv) > 1 else "hf_inference"
    base_generate = make_generate(backend)
    results = {}

    for granularity in GRANULARITIES:
        calls = 0

        def counting_generate(prompt: str) -> str:
            # Count backend calls so a repair round-trip can be told apart
            # from a first-try success.
            nonlocal calls
            calls += 1
            return base_generate(prompt)

        adapter = ModelAdapter(counting_generate, max_repairs=1)
        stats = {
            "tasks": 0,
            "valid": 0,
            "first_try": 0,
            "repaired": 0,
            "failed": 0,
            "steps": [],
            "minutes_violations": 0,
            "categories": Counter(),
            "seconds": 0.0,
        }

        for task in TASKS:
            calls = 0  # reset the per-task call counter
            stats["tasks"] += 1
            # perf_counter is a monotonic interval clock, so the measured
            # duration is not skewed by wall-clock adjustments.
            started = time.perf_counter()
            try:
                steps = adapter.breakdown(task, granularity)
            except StepValidationError as exc:
                stats["failed"] += 1
                print(f" {granularity} FAIL {task[:40]!r}: {exc}")
                continue
            finally:
                # Runs for successes and failures alike, so failed tasks still
                # contribute to the timing total.
                stats["seconds"] += time.perf_counter() - started

            stats["valid"] += 1
            if calls == 1:
                stats["first_try"] += 1
            else:
                stats["repaired"] += 1
            stats["steps"].append(len(steps.steps))
            for step in steps.steps:
                stats["categories"][step.category] += 1
                if step.est_minutes > MAX_MINUTES[granularity]:
                    stats["minutes_violations"] += 1

        results[granularity] = stats

    print(f"\n== {backend} · temperature={os.environ.get('UNSTUCK_TEMPERATURE', '0')} ==")
    print("granularity valid first-try repaired failed steps(avg) >cap s/task categories")
    for granularity, s in results.items():
        n = s["tasks"]
        avg_steps = sum(s["steps"]) / len(s["steps"]) if s["steps"] else 0
        # Guard the per-task average so an empty task list cannot divide by zero.
        per_task = s["seconds"] / n if n else 0.0
        cats = ", ".join(f"{c}:{k}" for c, k in s["categories"].most_common())
        print(
            f"{granularity:<11} {s['valid']}/{n:<4} {s['first_try']}/{n:<7}"
            f" {s['repaired']:<8} {s['failed']:<6} {avg_steps:<10.1f}"
            f" {s['minutes_violations']:<4} {per_task:<6.1f} {cats}"
        )

    # Counter values are converted to plain dicts for the JSON dump; anything
    # else json cannot encode falls back to str().
    serialisable = {
        g: {k: (dict(v) if isinstance(v, Counter) else v) for k, v in s.items()}
        for g, s in results.items()
    }
    print(json.dumps(serialisable, default=str))
    return 0
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|