"""OpenAI-compatible streaming chat client. One client covers all three providers — they all speak the same protocol: - local llama-server (base_url=http://127.0.0.1:8080/v1, no key) - OpenRouter (base_url=https://openrouter.ai/api/v1) - NVIDIA NIM (base_url=https://integrate.api.nvidia.com/v1) """ from __future__ import annotations import json import httpx from .backend import Message DEFAULT_MODEL = "nvidia/nemotron-3-nano-30b-a3b:free" class OpenAIChatBackend: def __init__( self, base_url: str, model: str = DEFAULT_MODEL, api_key: str = "", *, thinking: bool = False, timeout: float = 30.0, client: httpx.AsyncClient | None = None, ): self.base_url = base_url.rstrip("/") self.model = model self.api_key = api_key self.thinking = thinking self._client = client or httpx.AsyncClient(timeout=timeout) async def stream( self, messages: list[Message], *, max_tokens=256, temperature=0.6, thinking: bool | None = None, ): headers = {"Content-Type": "application/json"} if self.api_key: headers["Authorization"] = f"Bearer {self.api_key}" payload = { "model": self.model, "messages": messages, "max_tokens": max_tokens, "temperature": temperature, "top_p": 0.95, "stream": True, # Nemotron 3: reasoning off by default for in-game latency; a # caller may enable it per call (the Warden thinking out loud). # The kwarg sits at the prompt's tail, so the cache stays warm. "chat_template_kwargs": { "enable_thinking": self.thinking if thinking is None else thinking }, } async with self._client.stream( "POST", f"{self.base_url}/chat/completions", json=payload, headers=headers ) as response: response.raise_for_status() async for line in response.aiter_lines(): if not line.startswith("data: "): continue data = line[len("data: "):] if data.strip() == "[DONE]": break try: delta = json.loads(data)["choices"][0]["delta"] except (json.JSONDecodeError, KeyError, IndexError): continue chunk = delta.get("content") if chunk: yield chunk async def aclose(self) -> None: await self._client.aclose()