# Source: Hugging Face Spaces file view ("Running on Zero"); file size: 1,755 bytes; revision 79e91df.
from __future__ import annotations
import sys
from collections.abc import Callable
from pathlib import Path
# ROOT is the directory one level above the folder containing this file.
# Its "src" subdirectory is put at the front of sys.path before the
# `unstuck` imports below run (presumably that is where the package lives
# -- confirm against the repository layout).
ROOT = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(ROOT / "src"))
from unstuck.model_adapter import ModelAdapter
from unstuck.schema import StepValidationError
# Task descriptions that score() passes, one at a time, to
# ModelAdapter.breakdown(); the score is the fraction that succeed.
SAMPLE_TASKS = [
    "Clean my apartment before a friend visits tonight",
    "Start the first draft of a hackathon demo script",
    "Catch up on overdue email without losing the whole morning",
    "Prepare to call the dentist and book an appointment",
    "Make progress on a bug report that feels too vague to start",
]
# Model identifiers compared by the __main__ loop; each one is handed to
# make_generate(), which passes it to huggingface_hub.InferenceClient.
MODELS = [
    "Qwen/Qwen3-4B-Instruct-2507",
    "openbmb/MiniCPM3-4B",
    "nvidia/NVIDIA-Nemotron-3-Nano-4B-BF16",
]
def make_generate(model_id: str) -> Callable[[str], str]:
    """Build a prompt-to-text callable for ``model_id`` for manual bake-offs.

    A ``huggingface_hub.InferenceClient`` is created once for ``model_id``
    and captured by the returned closure, so every call reuses it.

    Args:
        model_id: Model identifier passed straight to ``InferenceClient``.

    Returns:
        A function that sends its ``prompt`` argument as a single user
        message via ``chat_completion`` (``max_tokens=512``,
        ``temperature=0``) and returns the first choice's message content
        converted with ``str()``.
    """
    # Imported here rather than at module top, so huggingface_hub is only
    # required once a generator is actually built.
    from huggingface_hub import InferenceClient

    hf_client = InferenceClient(model_id)

    def generate(prompt: str) -> str:
        chat = [{"role": "user", "content": prompt}]
        completion = hf_client.chat_completion(
            messages=chat,
            max_tokens=512,
            temperature=0,
        )
        first_choice = completion.choices[0]
        # str() guarantees the declared return type whatever the client
        # hands back as content.
        return str(first_choice.message.content)

    return generate
def score(model_id: str) -> float:
    """Return the share of SAMPLE_TASKS that ``model_id`` breaks down cleanly.

    A task counts as a success when ``ModelAdapter.breakdown`` returns
    without raising ``StepValidationError``. Any other exception is not
    caught here and propagates to the caller.

    Args:
        model_id: Model identifier forwarded to ``make_generate``.

    Returns:
        Successes divided by ``len(SAMPLE_TASKS)``, a value in [0.0, 1.0].
    """
    # max_repairs=1 is forwarded to the adapter unchanged; presumably it
    # caps repair attempts per task -- confirm in unstuck.model_adapter.
    adapter = ModelAdapter(make_generate(model_id), max_repairs=1)

    passed = 0
    for task in SAMPLE_TASKS:
        try:
            adapter.breakdown(task)
        except StepValidationError:
            # Validation failure: this task simply does not count.
            pass
        else:
            passed += 1

    return passed / len(SAMPLE_TASKS)
if __name__ == "__main__":
    # Run the bake-off: one "<model id>: <percent>" line per entry in MODELS,
    # in list order.
    for model_id in MODELS:
        success_rate = score(model_id)
        print(f"{model_id}: {success_rate:.0%}")
|