Spaces:
Running on Zero
Running on Zero
File size: 2,632 Bytes
"""OpenAI-compatible streaming chat client.
One client covers all three providers — they all speak the same protocol:
- local llama-server (base_url=http://127.0.0.1:8080/v1, no key)
- OpenRouter (base_url=https://openrouter.ai/api/v1)
- NVIDIA NIM (base_url=https://integrate.api.nvidia.com/v1)
"""
from __future__ import annotations
import json
import httpx
from .backend import Message
# Model id sent in the request payload when the caller does not pick one.
DEFAULT_MODEL = "nvidia/nemotron-3-nano-30b-a3b:free"


class OpenAIChatBackend:
    """Async streaming client for an OpenAI-compatible chat endpoint.

    Requests are POSTed to ``{base_url}/chat/completions`` with
    ``"stream": True`` and the server-sent-event response is decoded into
    plain text chunks.

    Args:
        base_url: API root; any trailing ``/`` is stripped.
        model: Model identifier placed in the request payload.
        api_key: Bearer token. When empty, no ``Authorization`` header is
            sent.
        thinking: Default for the ``enable_thinking`` chat-template kwarg;
            can be overridden per call in :meth:`stream`.
        timeout: Timeout passed to the ``httpx.AsyncClient`` that is created
            when ``client`` is not supplied.
        client: Optional pre-built client to use instead of creating one.
            Note that :meth:`aclose` closes whichever client is in use,
            including one supplied here.
    """

    def __init__(
        self,
        base_url: str,
        model: str = DEFAULT_MODEL,
        api_key: str = "",
        *,
        thinking: bool = False,
        timeout: float = 30.0,
        client: httpx.AsyncClient | None = None,
    ):
        self.base_url = base_url.rstrip("/")
        self.model = model
        self.api_key = api_key
        self.thinking = thinking
        if client is None:
            client = httpx.AsyncClient(timeout=timeout)
        self._client = client

    async def stream(
        self,
        messages: list[Message],
        *,
        max_tokens: int = 256,
        temperature: float = 0.6,
        thinking: bool | None = None,
    ):
        """Yield the assistant's reply as it streams in.

        Args:
            messages: Conversation sent as the payload's ``messages`` field.
            max_tokens: Value of the payload's ``max_tokens`` field.
            temperature: Value of the payload's ``temperature`` field.
            thinking: Per-call override for ``enable_thinking``; ``None``
                falls back to the instance default.

        Yields:
            Each non-empty ``choices[0].delta.content`` value, in arrival
            order, until the ``[DONE]`` sentinel or the end of the response.

        Raises:
            Whatever ``response.raise_for_status()`` raises for an error
            status; nothing is yielded in that case.
        """
        headers = {"Content-Type": "application/json"}
        if self.api_key:
            headers["Authorization"] = f"Bearer {self.api_key}"

        payload = {
            "model": self.model,
            "messages": messages,
            "max_tokens": max_tokens,
            "temperature": temperature,
            "top_p": 0.95,
            "stream": True,
            # Nemotron 3: reasoning off by default for in-game latency; a
            # caller may enable it per call (the Warden thinking out loud).
            # The kwarg sits at the prompt's tail, so the cache stays warm.
            "chat_template_kwargs": {
                "enable_thinking": self.thinking if thinking is None else thinking
            },
        }

        async with self._client.stream(
            "POST", f"{self.base_url}/chat/completions", json=payload, headers=headers
        ) as response:
            response.raise_for_status()
            async for line in response.aiter_lines():
                # The space after "data:" is optional in the event-stream
                # format, so match on the bare field name and strip the rest.
                # Blank separators and ":" comment/keep-alive lines fall
                # through here.
                if not line.startswith("data:"):
                    continue
                data = line[len("data:"):].strip()
                if data == "[DONE]":
                    break
                chunk = self._extract_content(data)
                if chunk:
                    yield chunk

    @staticmethod
    def _extract_content(data: str) -> str | None:
        """Return ``choices[0].delta.content`` from one SSE data payload.

        Any frame that does not have that shape (invalid JSON, missing or
        ``null`` ``choices``/``delta``, empty ``choices``) yields ``None`` so
        a single odd frame is skipped instead of aborting the stream.
        """
        try:
            delta = json.loads(data)["choices"][0]["delta"]
        except (json.JSONDecodeError, KeyError, IndexError, TypeError):
            # TypeError: payload is not an object, or "choices" is null.
            return None
        if not isinstance(delta, dict):
            # Guards against '"delta": null'.
            return None
        return delta.get("content")

    async def aclose(self) -> None:
        """Close the underlying HTTP client."""
        await self._client.aclose()
|