Spaces:

build-small-hackathon
/

Scrypt

Running on Zero

App Files Files Community

Scrypt / scrypt /inference /api.py

IMJONEZZ

SCRYPT: initial commit — game, sandbox, Warden, Space web layer

9fca766 13 days ago

Raw

History Blame Contribute Delete

2.63 kB

	"""OpenAI-compatible streaming chat client.

	One client covers all three providers — they all speak the same protocol:
	- local llama-server (base_url=http://127.0.0.1:8080/v1, no key)
	- OpenRouter (base_url=https://openrouter.ai/api/v1)
	- NVIDIA NIM (base_url=https://integrate.api.nvidia.com/v1)
	"""

	from __future__ import annotations

	import json

	import httpx

	from .backend import Message

	# Model requested when the caller does not name one.
	DEFAULT_MODEL = "nvidia/nemotron-3-nano-30b-a3b:free"


	class OpenAIChatBackend:
	    """Streaming chat client for an OpenAI-compatible ``/chat/completions`` API.

	    The backend POSTs to ``{base_url}/chat/completions`` with ``stream=True``
	    and yields the text fragments found in each streamed chunk's
	    ``choices[0].delta.content``.
	    """

	    def __init__(
	        self,
	        base_url: str,
	        model: str = DEFAULT_MODEL,
	        api_key: str = "",
	        *,
	        thinking: bool = False,
	        timeout: float = 30.0,
	        client: httpx.AsyncClient | None = None,
	    ):
	        """Configure the backend.

	        Args:
	            base_url: API root, e.g. ``http://127.0.0.1:8080/v1``; trailing
	                slashes are stripped.
	            model: Model identifier sent in every request.
	            api_key: Bearer token; when empty no ``Authorization`` header
	                is sent.
	            thinking: Default for ``enable_thinking`` when ``stream`` is
	                called without an explicit ``thinking`` argument.
	            timeout: Timeout handed to the ``httpx.AsyncClient`` created
	                here; unused when ``client`` is supplied.
	            client: Pre-built async client to use instead of creating one.
	                Note that ``aclose`` closes it as well.
	        """
	        self.base_url = base_url.rstrip("/")
	        self.model = model
	        self.api_key = api_key
	        self.thinking = thinking
	        if client is None:
	            client = httpx.AsyncClient(timeout=timeout)
	        self._client = client

	    async def stream(
	        self,
	        messages: list[Message],
	        *,
	        max_tokens: int = 256,
	        temperature: float = 0.6,
	        thinking: bool | None = None,
	    ):
	        """Stream one chat completion, yielding content fragments as they arrive.

	        Args:
	            messages: Conversation sent verbatim as the ``messages`` field
	                (presumably OpenAI-style role/content entries; see
	                ``Message`` in ``.backend``).
	            max_tokens: Completion length cap forwarded to the server.
	            temperature: Sampling temperature forwarded to the server.
	            thinking: Per-call override for ``enable_thinking``; ``None``
	                falls back to the instance default.

	        Yields:
	            Each non-empty ``content`` string from the streamed deltas.
	            Only the ``content`` key is read; any other delta fields are
	            ignored.

	        Raises:
	            Whatever ``response.raise_for_status()`` raises for an error
	            status (``httpx.HTTPStatusError`` with an httpx client).
	        """
	        headers = {"Content-Type": "application/json"}
	        if self.api_key:
	            headers["Authorization"] = f"Bearer {self.api_key}"
	        payload = {
	            "model": self.model,
	            "messages": messages,
	            "max_tokens": max_tokens,
	            "temperature": temperature,
	            "top_p": 0.95,
	            "stream": True,
	            # Nemotron 3: reasoning off by default for in-game latency; a
	            # caller may enable it per call (the Warden thinking out loud).
	            # The kwarg sits at the prompt's tail, so the cache stays warm.
	            "chat_template_kwargs": {
	                "enable_thinking": self.thinking if thinking is None else thinking
	            },
	        }
	        async with self._client.stream(
	            "POST", f"{self.base_url}/chat/completions", json=payload, headers=headers
	        ) as response:
	            response.raise_for_status()
	            async for line in response.aiter_lines():
	                # The space after "data:" is optional in the event-stream
	                # format, so match on the bare field name and strip.
	                if not line.startswith("data:"):
	                    continue  # comments, blank keep-alives, other fields
	                data = line[len("data:"):].strip()
	                if data == "[DONE]":
	                    break
	                try:
	                    delta = json.loads(data)["choices"][0]["delta"]
	                except (json.JSONDecodeError, KeyError, IndexError, TypeError):
	                    # Malformed or unexpected frame: skip it rather than
	                    # abort the whole stream.
	                    continue
	                if not isinstance(delta, dict):
	                    continue  # e.g. "delta": null
	                chunk = delta.get("content")
	                if chunk:
	                    yield chunk

	    async def aclose(self) -> None:
	        """Close the underlying HTTP client (including one passed to ``__init__``)."""
	        await self._client.aclose()