# Spaces:
# Running on Zero
# Running on Zero
"""OpenAI-compatible streaming chat client.

One client covers all three providers — they all speak the same protocol:
  - local llama-server (base_url=http://127.0.0.1:8080/v1, no key)
  - OpenRouter (base_url=https://openrouter.ai/api/v1)
  - NVIDIA NIM (base_url=https://integrate.api.nvidia.com/v1)
"""
from __future__ import annotations

import json
from collections.abc import AsyncIterator

import httpx

from .backend import Message
DEFAULT_MODEL = "nvidia/nemotron-3-nano-30b-a3b:free"


class OpenAIChatBackend:
    """Stream chat completions from an OpenAI-compatible HTTP endpoint.

    Requests are POSTed to ``{base_url}/chat/completions`` with
    ``"stream": True`` and the reply is consumed line by line as
    server-sent events. Only the ``content`` field of each streamed delta
    is surfaced to the caller; any other delta field is ignored.
    """

    def __init__(
        self,
        base_url: str,
        model: str = DEFAULT_MODEL,
        api_key: str = "",
        *,
        thinking: bool = False,
        timeout: float = 30.0,
        client: httpx.AsyncClient | None = None,
    ) -> None:
        """Configure the backend.

        Args:
            base_url: API root; trailing slashes are stripped.
            model: Model identifier sent in every request payload.
            api_key: Bearer token. When empty, no ``Authorization`` header
                is sent.
            thinking: Default for ``enable_thinking`` when a ``stream`` call
                does not override it.
            timeout: Timeout for the internally created client. Ignored
                when ``client`` is supplied.
            client: Existing ``httpx.AsyncClient`` to reuse. Note that
                ``aclose`` closes it as well.
        """
        self.base_url = base_url.rstrip("/")
        self.model = model
        self.api_key = api_key
        self.thinking = thinking
        # Test against None explicitly so the choice never depends on the
        # truthiness of the supplied client object.
        self._client = (
            client if client is not None else httpx.AsyncClient(timeout=timeout)
        )

    async def stream(
        self,
        messages: list[Message],
        *,
        max_tokens: int = 256,
        temperature: float = 0.6,
        thinking: bool | None = None,
    ) -> AsyncIterator[str]:
        """Yield the non-empty ``content`` pieces of a streamed completion.

        Args:
            messages: Conversation sent verbatim as the ``messages`` field.
            max_tokens: Forwarded as ``max_tokens``.
            temperature: Forwarded as ``temperature``.
            thinking: Per-call override for ``enable_thinking``; ``None``
                falls back to the instance default.

        Yields:
            Each truthy ``content`` value found in a streamed delta, in
            arrival order, until the ``[DONE]`` sentinel or end of stream.

        Raises:
            httpx.HTTPStatusError: If the server answers with an error
                status (raised by ``response.raise_for_status()``).
        """
        headers = {"Content-Type": "application/json"}
        if self.api_key:
            headers["Authorization"] = f"Bearer {self.api_key}"

        enable_thinking = self.thinking if thinking is None else thinking
        payload = {
            "model": self.model,
            "messages": messages,
            "max_tokens": max_tokens,
            "temperature": temperature,
            "top_p": 0.95,
            "stream": True,
            # Nemotron 3: reasoning off by default for in-game latency; a
            # caller may enable it per call (the Warden thinking out loud).
            # The kwarg sits at the prompt's tail, so the cache stays warm.
            "chat_template_kwargs": {"enable_thinking": enable_thinking},
        }

        url = f"{self.base_url}/chat/completions"
        async with self._client.stream(
            "POST", url, json=payload, headers=headers
        ) as response:
            response.raise_for_status()
            async for line in response.aiter_lines():
                data = self._sse_data(line)
                if data is None:
                    continue
                if data == "[DONE]":
                    break
                chunk = self._delta_content(data)
                if chunk:
                    yield chunk

    @staticmethod
    def _sse_data(line: str) -> str | None:
        """Return the stripped payload of an SSE ``data`` line, else ``None``."""
        # The space after "data:" is optional in the event-stream format, so
        # match on the field name alone and strip the surrounding whitespace.
        if not line.startswith("data:"):
            return None
        return line[len("data:"):].strip()

    @staticmethod
    def _delta_content(data: str):
        """Extract ``choices[0].delta.content`` from one JSON chunk.

        Returns ``None`` for anything that does not have that shape
        (malformed JSON, missing or null ``choices``/``delta``, non-object
        payloads) so one odd chunk cannot abort the whole stream.
        """
        try:
            delta = json.loads(data)["choices"][0]["delta"]
        except (json.JSONDecodeError, KeyError, IndexError, TypeError):
            return None
        if not isinstance(delta, dict):
            return None
        return delta.get("content")

    async def aclose(self) -> None:
        """Close the underlying HTTP client, including one passed to ``__init__``."""
        await self._client.aclose()