Spaces:

build-small-hackathon
/

unstuck

Running on Zero

File size: 5,170 Bytes

79e91df

"""Measured model eval: validity / repair / category stats per granularity.

Runs the real ModelAdapter pipeline (prompt -> generate -> validate -> one
repair) against an API backend and reports the numbers the field notes quote:

    .venv/bin/python scripts/eval_quality.py hf_inference
    NEBIUS_API_KEY=... .venv/bin/python scripts/eval_quality.py nebius

The zerogpu prefill path can't run off-GPU; scripts/smoke_live.py covers it
end-to-end against the deployed Space instead.
"""

from __future__ import annotations

import json
import os
import sys
import time
from collections import Counter
from collections.abc import Callable
from pathlib import Path

ROOT = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(ROOT / "src"))

from unstuck.model_adapter import ModelAdapter
from unstuck.schema import StepValidationError

# Fixed task prompts for the eval; main() runs every one of them through
# adapter.breakdown() once per entry in GRANULARITIES.
TASKS = [
    "Clean my apartment before a friend visits tonight",
    "Start the first draft of a hackathon demo script",
    "Catch up on overdue email without losing the whole morning",
    "Prepare to call the dentist and book an appointment",
    "Make progress on a bug report that feels too vague to start",
    "Plan a small birthday dinner for four people",
    "Unpack and organise my desk after moving",
    "Write a cover letter for a job I actually want",
    "Sort out my tax documents before the deadline",
    "Practice guitar when I haven't touched it in a month",
    "Back up my laptop and phone properly",
    "Get back into running after three weeks off",
]
# Granularity labels passed as the second argument to adapter.breakdown();
# one stats row is reported per label, in this order.
GRANULARITIES = ["chunky", "regular", "tiny"]
# Per-granularity cap on a step's est_minutes. A step above its cap is counted
# in "minutes_violations" (the ">cap" report column); it is not rejected here.
MAX_MINUTES = {"chunky": 25, "regular": 25, "tiny": 10}


def make_generate(backend: str) -> Callable[[str], str]:
    """Build a ``prompt -> completion`` callable backed by a remote chat API.

    Supported backends:

    * ``"hf_inference"`` -- an ``InferenceClient`` bound to
      ``Qwen/Qwen3-4B-Instruct-2507``; no explicit ``model`` is passed per call.
    * ``"nebius"`` -- an ``InferenceClient`` pointed at ``NEBIUS_BASE_URL``
      (default ``https://api.tokenfactory.nebius.com/v1/``), authenticated with
      ``NEBIUS_API_KEY`` and using ``NEBIUS_MODEL``
      (default ``Qwen/Qwen3-30B-A3B-Instruct-2507``).

    The sampling temperature is read once from ``UNSTUCK_TEMPERATURE``
    (default ``0``).

    Args:
        backend: Name of the backend to talk to.

    Returns:
        A function that sends ``prompt`` as a single user message
        (``max_tokens=512``) and returns the first choice's text. A missing
        (``None``) message content is returned as ``""``.

    Raises:
        SystemExit: If ``backend`` is not one of the supported names.
        KeyError: If ``backend`` is ``"nebius"`` and ``NEBIUS_API_KEY`` is unset.
        ValueError: If ``UNSTUCK_TEMPERATURE`` is not parseable as a float.
    """
    # Reject unknown backends before the third-party import, so a bad argument
    # always yields the intended message rather than an ImportError.
    if backend not in ("hf_inference", "nebius"):
        raise SystemExit(f"unsupported backend for offline eval: {backend}")

    temperature = float(os.environ.get("UNSTUCK_TEMPERATURE", "0"))

    from huggingface_hub import InferenceClient

    if backend == "hf_inference":
        client = InferenceClient("Qwen/Qwen3-4B-Instruct-2507")
        model = None
    else:  # "nebius"
        client = InferenceClient(
            base_url=os.environ.get(
                "NEBIUS_BASE_URL", "https://api.tokenfactory.nebius.com/v1/"
            ),
            api_key=os.environ["NEBIUS_API_KEY"],
        )
        model = os.environ.get("NEBIUS_MODEL", "Qwen/Qwen3-30B-A3B-Instruct-2507")

    # Invariant across calls: only pass ``model`` when one is configured.
    model_kwargs = {"model": model} if model else {}

    def generate(prompt: str) -> str:
        response = client.chat_completion(
            messages=[{"role": "user", "content": prompt}],
            max_tokens=512,
            temperature=temperature,
            **model_kwargs,
        )
        content = response.choices[0].message.content
        # str(None) would hand the literal text "None" to validation as if the
        # model had written it; treat a missing completion as empty output.
        return "" if content is None else str(content)

    return generate


def _evaluate_granularity(base_generate: Callable[[str], str], granularity: str) -> dict:
    """Run every task in TASKS through a fresh ModelAdapter at one granularity.

    Args:
        base_generate: The ``prompt -> completion`` callable to wrap.
        granularity: Label passed to ``adapter.breakdown``; also the key into
            MAX_MINUTES for the per-step minutes cap.

    Returns:
        A stats dict with the keys ``tasks``, ``valid``, ``first_try``,
        ``repaired``, ``failed``, ``steps`` (step count per valid task),
        ``minutes_violations``, ``categories`` (a Counter) and ``seconds``.
    """
    calls = 0

    def counting_generate(prompt: str) -> str:
        # Count model calls per task so a first-try success (exactly one call)
        # can be told apart from one that needed more.
        nonlocal calls
        calls += 1
        return base_generate(prompt)

    adapter = ModelAdapter(counting_generate, max_repairs=1)
    stats = {
        "tasks": 0,
        "valid": 0,
        "first_try": 0,
        "repaired": 0,
        "failed": 0,
        "steps": [],
        "minutes_violations": 0,
        "categories": Counter(),
        "seconds": 0.0,
    }
    minutes_cap = MAX_MINUTES[granularity]
    for task in TASKS:
        calls = 0
        stats["tasks"] += 1
        # perf_counter is monotonic; time.time() can jump with clock changes.
        started = time.perf_counter()
        try:
            steps = adapter.breakdown(task, granularity)
        except StepValidationError as exc:
            stats["failed"] += 1
            print(f"  {granularity} FAIL {task[:40]!r}: {exc}")
            continue
        finally:
            # Runs on success and failure alike, so failed tasks are timed too.
            stats["seconds"] += time.perf_counter() - started
        stats["valid"] += 1
        if calls == 1:
            stats["first_try"] += 1
        else:
            stats["repaired"] += 1
        stats["steps"].append(len(steps.steps))
        for step in steps.steps:
            stats["categories"][step.category] += 1
            if step.est_minutes > minutes_cap:
                stats["minutes_violations"] += 1
    return stats


def _print_report(backend: str, results: dict) -> None:
    """Print the per-granularity table followed by a one-line JSON dump."""
    print(f"\n== {backend} · temperature={os.environ.get('UNSTUCK_TEMPERATURE', '0')} ==")
    print("granularity  valid  first-try  repaired  failed  steps(avg)  >cap  s/task  categories")
    for granularity, s in results.items():
        n = s["tasks"]
        avg_steps = sum(s["steps"]) / len(s["steps"]) if s["steps"] else 0
        # Guard the per-task average so an empty TASKS list cannot divide by zero.
        secs_per_task = s["seconds"] / n if n else 0.0
        cats = ", ".join(f"{c}:{k}" for c, k in s["categories"].most_common())
        print(
            f"{granularity:<11}  {s['valid']}/{n:<4} {s['first_try']}/{n:<7}"
            f"  {s['repaired']:<8}  {s['failed']:<6}  {avg_steps:<10.1f}"
            f"  {s['minutes_violations']:<4}  {secs_per_task:<6.1f}  {cats}"
        )
    # Counters are converted to plain dicts so they serialise as JSON objects.
    serialisable = {
        g: {k: (dict(v) if isinstance(v, Counter) else v) for k, v in s.items()}
        for g, s in results.items()
    }
    print(json.dumps(serialisable, default=str))


def main() -> int:
    """Evaluate the chosen backend across all granularities and print a report.

    The backend name is taken from ``sys.argv[1]`` and defaults to
    ``"hf_inference"``. Only StepValidationError is treated as a per-task
    failure; any other exception from the pipeline propagates and aborts the
    run.

    Returns:
        Process exit status; always ``0`` when the run completes.
    """
    backend = sys.argv[1] if len(sys.argv) > 1 else "hf_inference"
    base_generate = make_generate(backend)
    results = {
        granularity: _evaluate_granularity(base_generate, granularity)
        for granularity in GRANULARITIES
    }
    _print_report(backend, results)
    return 0


# Script entry point: the integer returned by main() becomes the exit status.
if __name__ == "__main__":
    raise SystemExit(main())