# Scraped page header (not part of the script): file size 5,170 Bytes, revision 79e91df.
"""Measured model eval: validity / repair / category stats per granularity.

Runs the real ModelAdapter pipeline (prompt -> generate -> validate -> one
repair) against an API backend and reports the numbers the field notes quote:

    .venv/bin/python scripts/eval_quality.py hf_inference
    NEBIUS_API_KEY=... .venv/bin/python scripts/eval_quality.py nebius

The zerogpu prefill path can't run off-GPU; scripts/smoke_live.py covers it
end-to-end against the deployed Space instead.
"""

from __future__ import annotations

import json
import os
import sys
import time
from collections import Counter
from collections.abc import Callable
from pathlib import Path

ROOT = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(ROOT / "src"))

from unstuck.model_adapter import ModelAdapter
from unstuck.schema import StepValidationError

# Fixed set of task prompts; the eval loop runs every task once per granularity.
TASKS = [
    "Clean my apartment before a friend visits tonight",
    "Start the first draft of a hackathon demo script",
    "Catch up on overdue email without losing the whole morning",
    "Prepare to call the dentist and book an appointment",
    "Make progress on a bug report that feels too vague to start",
    "Plan a small birthday dinner for four people",
    "Unpack and organise my desk after moving",
    "Write a cover letter for a job I actually want",
    "Sort out my tax documents before the deadline",
    "Practice guitar when I haven't touched it in a month",
    "Back up my laptop and phone properly",
    "Get back into running after three weeks off",
]
# Granularity labels passed to ModelAdapter.breakdown; one stats row is
# reported per entry, in this order.
GRANULARITIES = ["chunky", "regular", "tiny"]
# Per-granularity cap on a step's est_minutes; steps above the cap are counted
# as "minutes_violations" (the ">cap" column) but do not fail the task here.
MAX_MINUTES = {"chunky": 25, "regular": 25, "tiny": 10}


def make_generate(backend: str) -> Callable[[str], str]:
    """Build a prompt -> completion callable for the chosen API backend.

    Args:
        backend: ``"hf_inference"`` (an ``InferenceClient`` bound to
            ``Qwen/Qwen3-4B-Instruct-2507``) or ``"nebius"`` (a client built
            from ``NEBIUS_BASE_URL`` / ``NEBIUS_API_KEY``, with the model taken
            from ``NEBIUS_MODEL``).

    Returns:
        A callable that sends the prompt as a single user chat message
        (``max_tokens=512``, temperature from ``UNSTUCK_TEMPERATURE``,
        default ``0``) and returns the first choice's message content.

    Raises:
        SystemExit: if ``backend`` is not one of the supported names.
        KeyError: if ``backend`` is ``"nebius"`` and ``NEBIUS_API_KEY`` is unset.
        ValueError: if ``UNSTUCK_TEMPERATURE`` is not a valid float.
    """
    # Reject unknown backends before touching the environment or importing the
    # client library, so a typo in the backend name always yields this message.
    if backend not in ("hf_inference", "nebius"):
        raise SystemExit(f"unsupported backend for offline eval: {backend}")

    from huggingface_hub import InferenceClient

    temperature = float(os.environ.get("UNSTUCK_TEMPERATURE", "0"))
    if backend == "hf_inference":
        client = InferenceClient("Qwen/Qwen3-4B-Instruct-2507")
        model = None
    else:
        client = InferenceClient(
            base_url=os.environ.get(
                "NEBIUS_BASE_URL", "https://api.tokenfactory.nebius.com/v1/"
            ),
            api_key=os.environ["NEBIUS_API_KEY"],
        )
        model = os.environ.get("NEBIUS_MODEL", "Qwen/Qwen3-30B-A3B-Instruct-2507")

    # Only pass ``model`` when one is set; the hf_inference client is already
    # bound to its model. Built once rather than on every call.
    extra_kwargs = {"model": model} if model else {}

    def generate(prompt: str) -> str:
        response = client.chat_completion(
            messages=[{"role": "user", "content": prompt}],
            max_tokens=512,
            temperature=temperature,
            **extra_kwargs,
        )
        content = response.choices[0].message.content
        # A missing completion becomes an empty string instead of the literal
        # text "None", which would otherwise be handed on as model output.
        return "" if content is None else str(content)

    return generate


def _evaluate_granularity(base_generate: Callable[[str], str], granularity: str) -> dict:
    """Run every task in TASKS at one granularity and collect the stats.

    Args:
        base_generate: prompt -> completion callable; it is wrapped so the
            number of generate calls per task can be counted.
        granularity: granularity label passed to ``ModelAdapter.breakdown``.

    Returns:
        A dict with the counters ``tasks``, ``valid``, ``first_try``,
        ``repaired``, ``failed`` and ``minutes_violations``, the list
        ``steps`` (step count per valid task), the ``categories`` Counter and
        the total elapsed ``seconds``.
    """
    calls = 0

    def counting_generate(prompt: str) -> str:
        nonlocal calls
        calls += 1
        return base_generate(prompt)

    adapter = ModelAdapter(counting_generate, max_repairs=1)
    stats = {
        "tasks": 0,
        "valid": 0,
        "first_try": 0,
        "repaired": 0,
        "failed": 0,
        "steps": [],
        "minutes_violations": 0,
        "categories": Counter(),
        "seconds": 0.0,
    }
    for task in TASKS:
        calls = 0  # reset so the count below refers to this task only
        stats["tasks"] += 1
        # perf_counter is monotonic, so a wall-clock adjustment during a slow
        # API call cannot distort (or make negative) the measured duration.
        started = time.perf_counter()
        try:
            steps = adapter.breakdown(task, granularity)
        except StepValidationError as exc:
            stats["failed"] += 1
            print(f"  {granularity} FAIL {task[:40]!r}: {exc}")
            continue
        finally:
            # Runs on success and on the ``continue`` above, so failed tasks
            # are included in the timing too.
            stats["seconds"] += time.perf_counter() - started
        stats["valid"] += 1
        # Exactly one generate call means the first output was accepted; any
        # other count is recorded as a repaired result.
        if calls == 1:
            stats["first_try"] += 1
        else:
            stats["repaired"] += 1
        stats["steps"].append(len(steps.steps))
        for step in steps.steps:
            stats["categories"][step.category] += 1
            if step.est_minutes > MAX_MINUTES[granularity]:
                stats["minutes_violations"] += 1
    return stats


def _print_report(backend: str, results: dict) -> None:
    """Print the per-granularity table followed by one line of JSON stats."""
    print(f"\n== {backend} · temperature={os.environ.get('UNSTUCK_TEMPERATURE', '0')} ==")
    print("granularity  valid  first-try  repaired  failed  steps(avg)  >cap  s/task  categories")
    for granularity, s in results.items():
        n = s["tasks"]
        avg_steps = sum(s["steps"]) / len(s["steps"]) if s["steps"] else 0
        # Guard the per-task average so an empty task list does not crash.
        per_task = s["seconds"] / n if n else 0.0
        cats = ", ".join(f"{c}:{k}" for c, k in s["categories"].most_common())
        print(
            f"{granularity:<11}  {s['valid']}/{n:<4} {s['first_try']}/{n:<7}"
            f"  {s['repaired']:<8}  {s['failed']:<6}  {avg_steps:<10.1f}"
            f"  {s['minutes_violations']:<4}  {per_task:<6.1f}  {cats}"
        )
    # Counter values are converted to plain dicts for the JSON dump.
    serialisable = {
        g: {k: (dict(v) if isinstance(v, Counter) else v) for k, v in s.items()}
        for g, s in results.items()
    }
    print(json.dumps(serialisable, default=str))


def main() -> int:
    """Evaluate every granularity against one backend and print the report.

    The backend name is read from the first command-line argument and
    defaults to ``"hf_inference"``.

    Returns:
        ``0``, used as the process exit status.
    """
    backend = sys.argv[1] if len(sys.argv) > 1 else "hf_inference"
    base_generate = make_generate(backend)
    results = {
        granularity: _evaluate_granularity(base_generate, granularity)
        for granularity in GRANULARITIES
    }
    _print_report(backend, results)
    return 0


if __name__ == "__main__":
    sys.exit(main())