Spaces:

build-small-hackathon
/

unstuck

Running on Zero

unstuck / scripts /bakeoff.py

feat: similar-task recall via Nebius embeddings (keyless; recall degrades off without NEBIUS_API_KEY)

79e91df verified 16 days ago

1.76 kB

	from __future__ import annotations

	import sys
	from collections.abc import Callable
	from pathlib import Path


	# Directory one level above the folder that contains this script.
	ROOT = Path(__file__).resolve().parents[1]
	# Put <ROOT>/src first on the import path; presumably this is what lets the
	# `unstuck` imports below resolve when the script is run from a checkout.
	sys.path.insert(0, str(ROOT / "src"))

	from unstuck.model_adapter import ModelAdapter
	from unstuck.schema import StepValidationError


	# Task descriptions fed to every candidate model. `score` reports the fraction
	# of these for which `ModelAdapter.breakdown` does not raise StepValidationError.
	SAMPLE_TASKS: list[str] = [
	"Clean my apartment before a friend visits tonight",
	"Start the first draft of a hackathon demo script",
	"Catch up on overdue email without losing the whole morning",
	"Prepare to call the dentist and book an appointment",
	"Make progress on a bug report that feels too vague to start",
	]

	# Model ids compared by the `__main__` loop. Each id is passed to `score`,
	# which hands it to `InferenceClient` through `make_generate`.
	MODELS: list[str] = [
	"Qwen/Qwen3-4B-Instruct-2507",
	"openbmb/MiniCPM3-4B",
	"nvidia/NVIDIA-Nemotron-3-Nano-4B-BF16",
	]


	def make_generate(model_id: str) -> Callable[[str], str]:
	    """Create a serverless HF chat generator for manual model bake-offs.

	    Args:
	        model_id: Model identifier handed to ``InferenceClient``.

	    Returns:
	        A function that sends one prompt as a single user message
	        (``max_tokens=512``, ``temperature=0``) and returns the text of the
	        first choice, or ``""`` when that choice carries no content.
	    """
	    # Function-scope import: huggingface_hub is only needed once a generator
	    # is actually built, not when this module is merely imported.
	    from huggingface_hub import InferenceClient

	    client = InferenceClient(model_id)

	    def generate(prompt: str) -> str:
	        response = client.chat_completion(
	            messages=[{"role": "user", "content": prompt}],
	            max_tokens=512,
	            temperature=0,
	        )
	        content = response.choices[0].message.content
	        # The message content may be None; str(None) would pass the literal
	        # text "None" downstream as if the model had produced it.
	        return "" if content is None else str(content)

	    return generate


	def score(model_id: str) -> float:
	    """Measure how often a model yields a validated breakdown.

	    Builds a ``ModelAdapter`` (``max_repairs=1``) around the generator for
	    ``model_id`` and calls ``breakdown`` once per entry in ``SAMPLE_TASKS``.

	    Args:
	        model_id: Model identifier forwarded to ``make_generate``.

	    Returns:
	        The share of sample tasks, between 0.0 and 1.0, for which
	        ``breakdown`` did not raise ``StepValidationError``. Any other
	        exception propagates to the caller.
	    """
	    adapter = ModelAdapter(make_generate(model_id), max_repairs=1)

	    def validates(task: str) -> bool:
	        # Only a validation failure counts as a miss; everything else bubbles up.
	        try:
	            adapter.breakdown(task)
	        except StepValidationError:
	            return False
	        return True

	    passed = sum(validates(task) for task in SAMPLE_TASKS)
	    return passed / len(SAMPLE_TASKS)


	if __name__ == "__main__":
	    # Print one "<model id>: <success rate as a whole percent>" line per
	    # candidate, in MODELS order, as soon as that model has been scored.
	    for model_id in MODELS:
	        report = f"{model_id}: {score(model_id):.0%}"
	        print(report)