Scrypt / scrypt /inference /api.py
IMJONEZZ's picture
SCRYPT: initial commit — game, sandbox, Warden, Space web layer
9fca766
Raw
History Blame Contribute Delete
2.63 kB
"""OpenAI-compatible streaming chat client.
One client covers all three providers — they all speak the same protocol:
- local llama-server (base_url=http://127.0.0.1:8080/v1, no key)
- OpenRouter (base_url=https://openrouter.ai/api/v1)
- NVIDIA NIM (base_url=https://integrate.api.nvidia.com/v1)
"""
from __future__ import annotations
import json
import httpx
from .backend import Message
# Model id used when the caller does not pass ``model``.
DEFAULT_MODEL = "nvidia/nemotron-3-nano-30b-a3b:free"


class OpenAIChatBackend:
    """Streaming client for an OpenAI-compatible ``/chat/completions`` endpoint.

    The backend POSTs a chat request with ``"stream": True`` and yields the
    text fragments found in each server-sent ``data:`` event until the
    ``[DONE]`` sentinel arrives or the response ends.
    """

    # Field name that introduces a payload line in the event stream. The
    # space after the colon is optional on the wire, so it is not part of
    # the prefix; surrounding whitespace is stripped from the payload instead.
    _SSE_DATA_FIELD = "data:"
    # Payload that marks the end of the completion stream.
    _SSE_DONE = "[DONE]"

    def __init__(
        self,
        base_url: str,
        model: str = DEFAULT_MODEL,
        api_key: str = "",
        *,
        thinking: bool = False,
        timeout: float = 30.0,
        client: httpx.AsyncClient | None = None,
    ):
        """Configure the backend.

        Args:
            base_url: API root; any trailing ``/`` is removed.
            model: Model id sent with every request.
            api_key: Bearer token. When empty, no ``Authorization`` header
                is sent.
            thinking: Default for the ``enable_thinking`` template kwarg.
            timeout: Timeout for the internally created HTTP client. Ignored
                when ``client`` is supplied.
            client: Existing HTTP client to use instead of creating one.
        """
        self.base_url = base_url.rstrip("/")
        self.model = model
        self.api_key = api_key
        self.thinking = thinking
        if client is None:
            client = httpx.AsyncClient(timeout=timeout)
        self._client = client

    @classmethod
    def _sse_payload(cls, line: str) -> str | None:
        """Return the payload of a ``data:`` event line, or None for other lines."""
        if not line.startswith(cls._SSE_DATA_FIELD):
            return None
        return line[len(cls._SSE_DATA_FIELD):].strip()

    @staticmethod
    def _delta_text(data: str) -> str | None:
        """Return the non-empty ``content`` of a streamed chunk, else None.

        Malformed or unexpected payloads (invalid JSON, missing or null
        ``choices``/``delta``, non-object JSON) yield None so that a single
        odd frame is skipped rather than aborting the whole stream.
        """
        try:
            delta = json.loads(data)["choices"][0]["delta"]
        except (json.JSONDecodeError, KeyError, IndexError, TypeError):
            return None
        if not isinstance(delta, dict):
            return None
        return delta.get("content") or None

    async def stream(
        self,
        messages: list[Message],
        *,
        max_tokens=256,
        temperature=0.6,
        thinking: bool | None = None,
    ):
        """Stream a chat completion, yielding content fragments as they arrive.

        Args:
            messages: Conversation sent as the request's ``messages`` field.
            max_tokens: Value of the request's ``max_tokens`` field.
            temperature: Value of the request's ``temperature`` field.
            thinking: Per-call override for ``enable_thinking``; ``None``
                falls back to the instance default.

        Yields:
            Each non-empty ``content`` value from the streamed deltas.

        Raises:
            httpx.HTTPStatusError: If the server answers with an error status.
        """
        headers = {"Content-Type": "application/json"}
        if self.api_key:
            headers["Authorization"] = f"Bearer {self.api_key}"
        payload = {
            "model": self.model,
            "messages": messages,
            "max_tokens": max_tokens,
            "temperature": temperature,
            "top_p": 0.95,
            "stream": True,
            # Nemotron 3: reasoning off by default for in-game latency; a
            # caller may enable it per call (the Warden thinking out loud).
            # The kwarg sits at the prompt's tail, so the cache stays warm.
            "chat_template_kwargs": {
                "enable_thinking": self.thinking if thinking is None else thinking
            },
        }
        async with self._client.stream(
            "POST",
            f"{self.base_url}/chat/completions",
            json=payload,
            headers=headers,
        ) as response:
            response.raise_for_status()
            async for line in response.aiter_lines():
                data = self._sse_payload(line)
                if data is None:
                    # Blank separators, comments, and other event fields.
                    continue
                if data == self._SSE_DONE:
                    break
                chunk = self._delta_text(data)
                if chunk:
                    yield chunk

    async def aclose(self) -> None:
        """Close the underlying HTTP client.

        This also closes a client that was passed in through ``client``.
        """
        await self._client.aclose()