File size: 2,632 Bytes
9fca766
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
"""OpenAI-compatible streaming chat client.

One client covers all three providers — they all speak the same protocol:
- local llama-server   (base_url=http://127.0.0.1:8080/v1, no key)
- OpenRouter           (base_url=https://openrouter.ai/api/v1)
- NVIDIA NIM           (base_url=https://integrate.api.nvidia.com/v1)
"""

from __future__ import annotations

import json
from collections.abc import AsyncIterator

import httpx

from .backend import Message

# Model identifier sent as the request's "model" field when OpenAIChatBackend
# is constructed without an explicit ``model`` argument.
# NOTE(review): the slug format looks provider-specific — confirm it is valid
# for whichever base_url is configured.
DEFAULT_MODEL = "nvidia/nemotron-3-nano-30b-a3b:free"


class OpenAIChatBackend:
    """Streaming chat client for OpenAI-compatible ``/chat/completions`` APIs.

    The instance holds the endpoint configuration and an ``httpx.AsyncClient``;
    :meth:`stream` issues one streaming request per call and yields the reply
    text as it arrives.
    """

    def __init__(
        self,
        base_url: str,
        model: str = DEFAULT_MODEL,
        api_key: str = "",
        *,
        thinking: bool = False,
        timeout: float = 30.0,
        client: httpx.AsyncClient | None = None,
    ) -> None:
        """Store the endpoint configuration and set up the HTTP client.

        Args:
            base_url: API root (e.g. ``http://127.0.0.1:8080/v1``). Trailing
                slashes are stripped so the request path joins cleanly.
            model: Model identifier sent with every request.
            api_key: Bearer token; when empty, no ``Authorization`` header is
                sent.
            thinking: Default value for the ``enable_thinking`` template
                kwarg; it can be overridden per call in :meth:`stream`.
            timeout: Timeout passed to the ``httpx.AsyncClient`` created here.
                Ignored when ``client`` is supplied.
            client: Existing client to use instead of creating one.
        """
        self.base_url = base_url.rstrip("/")
        self.model = model
        self.api_key = api_key
        self.thinking = thinking
        # Explicit None check: only a missing client triggers construction,
        # regardless of how a supplied object happens to evaluate as a bool.
        self._client = (
            httpx.AsyncClient(timeout=timeout) if client is None else client
        )

    @staticmethod
    def _delta_content(data: str) -> str | None:
        """Return ``choices[0].delta.content`` from one SSE payload, else None.

        Payloads that are not valid JSON, or that do not have the expected
        shape (missing/empty/null ``choices``, null ``delta``, a non-object
        top level), yield ``None`` so the caller can skip them instead of
        aborting the whole stream.
        """
        try:
            delta = json.loads(data)["choices"][0]["delta"]
        except (json.JSONDecodeError, KeyError, IndexError, TypeError):
            # TypeError covers JSON that parses but has the wrong container
            # types, e.g. ``"choices": null`` or a top-level array.
            return None
        if not isinstance(delta, dict):
            return None
        return delta.get("content")

    async def stream(
        self,
        messages: list[Message],
        *,
        max_tokens: int = 256,
        temperature: float = 0.6,
        thinking: bool | None = None,
    ) -> AsyncIterator[str]:
        """Stream one chat completion, yielding text fragments as they arrive.

        Args:
            messages: Conversation sent unchanged as the request's
                ``messages`` field.
            max_tokens: Forwarded as ``max_tokens``.
            temperature: Forwarded as ``temperature``.
            thinking: Per-call override for ``enable_thinking``; ``None``
                falls back to the instance-level ``thinking`` setting.

        Yields:
            Every non-empty ``choices[0].delta.content`` value, in order,
            until the server sends ``[DONE]`` or the response ends.

        Raises:
            httpx.HTTPStatusError: If the server answers with an error status
                (raised by ``response.raise_for_status()``).
        """
        headers = {"Content-Type": "application/json"}
        if self.api_key:
            headers["Authorization"] = f"Bearer {self.api_key}"

        enable_thinking = self.thinking if thinking is None else thinking
        payload = {
            "model": self.model,
            "messages": messages,
            "max_tokens": max_tokens,
            "temperature": temperature,
            "top_p": 0.95,
            "stream": True,
            # Nemotron 3: reasoning off by default for in-game latency; a
            # caller may enable it per call (the Warden thinking out loud).
            # The kwarg sits at the prompt's tail, so the cache stays warm.
            "chat_template_kwargs": {"enable_thinking": enable_thinking},
        }
        url = f"{self.base_url}/chat/completions"

        async with self._client.stream(
            "POST", url, json=payload, headers=headers
        ) as response:
            response.raise_for_status()
            async for line in response.aiter_lines():
                # The SSE format makes the space after "data:" optional, so
                # accept both "data: {...}" and "data:{...}". Comment lines,
                # blank keep-alives and other fields are ignored.
                if not line.startswith("data:"):
                    continue
                data = line[len("data:"):].strip()
                if data == "[DONE]":
                    break
                chunk = self._delta_content(data)
                if chunk:
                    yield chunk

    async def aclose(self) -> None:
        """Close the underlying HTTP client.

        This also closes a client that was passed in through ``client=``, so
        a shared client must not be reused after calling this.
        """
        await self._client.aclose()