SwapnilPatil28's picture
Upgrade 1 - Dashboard Update and new Incidents
540b82c verified
"""Thin client for calling a remote LLM from the FastAPI server.
Used by the dashboard's "live inference" panel so a Hugging Face Space can
delegate the expensive forward pass to a dedicated HF Inference Endpoint
(GPU-backed) without loading the model inside the Space container.
Two backends are supported:
- ``chat`` (default) — OpenAI-compatible ``/v1/chat/completions`` endpoint.
Hugging Face TGI-based Inference Endpoints expose this path, as do most
vLLM deployments. This is the recommended setup.
- ``generate`` — Raw TGI ``/generate`` endpoint. Useful when chat templating
is already baked into the prompt and you just want raw text completion.
Configuration via environment variables (set them as HF Space secrets):
- ``LLM_ENDPOINT_URL`` — **required** to enable the panel. E.g.
``https://abc.us-east-1.aws.endpoints.huggingface.cloud``. Without this
env var, ``is_configured()`` returns ``False`` and the dashboard shows a
setup hint instead of the demo.
- ``HF_TOKEN`` — **required**. A Hugging Face token with ``read``
scope over the model repo powering the endpoint.
- ``LLM_ENDPOINT_MODE`` — optional, one of ``chat`` / ``generate``
(default: ``chat``).
- ``LLM_MODEL_ID`` — optional display / routing hint the endpoint
sometimes cares about (default: ``"tgi"``).
- ``LLM_MAX_NEW_TOKENS`` — optional integer (default: ``160``).
- ``LLM_TIMEOUT_S`` — optional integer (default: ``25``).
The module uses only the Python stdlib (``urllib.request``) so it adds
zero extra dependencies to the HF Space Docker image.
"""
from __future__ import annotations
import json
import logging
import os
import socket
import urllib.error
import urllib.request
from dataclasses import dataclass
from typing import Any, Dict, Optional
# Module-level logger for this client; messages are emitted under the
# "icc.llm_remote" logger name.
_LOG = logging.getLogger("icc.llm_remote")
@dataclass(frozen=True)
class RemoteLLMConfig:
    """Immutable settings describing how to reach the remote LLM endpoint.

    Instances are normally built with :meth:`from_env`, which reads the
    ``LLM_*`` / ``HF_TOKEN`` environment variables.
    """

    # Base URL of the endpoint (``from_env`` strips any trailing slash).
    endpoint_url: str
    # Access token for the endpoint, read from ``HF_TOKEN`` by ``from_env``.
    token: str
    mode: str = "chat"  # "chat" | "generate"
    model_id: str = "tgi"
    max_new_tokens: int = 160
    timeout_s: int = 25

    @staticmethod
    def _positive_int_from_env(name: str, default: int) -> int:
        """Read env var ``name`` as a positive int, else return ``default``.

        An unset or blank variable yields ``default`` silently. A value that
        is not an integer, or is zero/negative, also yields ``default`` but
        logs a warning so the misconfiguration is visible.
        """
        raw = os.environ.get(name, "").strip()
        if not raw:
            return default
        try:
            value = int(raw)
        except ValueError:
            value = 0  # treated as invalid by the check below
        if value <= 0:
            logging.getLogger("icc.llm_remote").warning(
                "Ignoring invalid %s=%r; using default %d", name, raw, default
            )
            return default
        return value

    @classmethod
    def from_env(cls) -> Optional["RemoteLLMConfig"]:
        """Build a config from environment variables.

        Returns:
            ``None`` when ``LLM_ENDPOINT_URL`` or ``HF_TOKEN`` is missing or
            blank; otherwise a populated ``RemoteLLMConfig``.

        Malformed optional values never raise: bad integers fall back to the
        defaults and an unrecognised ``LLM_ENDPOINT_MODE`` falls back to
        ``"chat"``, so status checks cannot crash on a typo in a secret.
        """
        url = os.environ.get("LLM_ENDPOINT_URL", "").strip()
        token = os.environ.get("HF_TOKEN", "").strip()
        if not url or not token:
            return None

        mode = os.environ.get("LLM_ENDPOINT_MODE", "chat").strip().lower() or "chat"
        if mode not in ("chat", "generate"):
            logging.getLogger("icc.llm_remote").warning(
                "Unknown LLM_ENDPOINT_MODE=%r; using 'chat'", mode
            )
            mode = "chat"

        return cls(
            endpoint_url=url.rstrip("/"),
            token=token,
            mode=mode,
            model_id=os.environ.get("LLM_MODEL_ID", "tgi").strip() or "tgi",
            max_new_tokens=cls._positive_int_from_env("LLM_MAX_NEW_TOKENS", 160),
            timeout_s=cls._positive_int_from_env("LLM_TIMEOUT_S", 25),
        )
def is_configured() -> bool:
    """Report whether remote inference can be attempted.

    True exactly when ``RemoteLLMConfig.from_env()`` yields a config, i.e. the
    required environment variables are present.
    """
    cfg = RemoteLLMConfig.from_env()
    return cfg is not None
def status_summary() -> Dict[str, Any]:
    """Describe the remote-LLM setup in a small dict for the dashboard.

    When no config can be built from the environment, the result has
    ``configured=False`` and a ``reason`` hint. Otherwise it reports the
    mode, model id, token budget, whether a token is present, and the
    endpoint host.
    """
    cfg = RemoteLLMConfig.from_env()

    if cfg is not None:
        summary: Dict[str, Any] = {"configured": True}
        summary["mode"] = cfg.mode
        summary["model_id"] = cfg.model_id
        summary["max_new_tokens"] = cfg.max_new_tokens
        # The token value itself is never included, only a presence flag.
        summary["token_present"] = bool(cfg.token)
        # Report the host rather than the full URL so that anything else in
        # the configured URL is not surfaced to the dashboard.
        summary["host"] = _safe_host(cfg.endpoint_url)
        return summary

    hint = (
        "Set LLM_ENDPOINT_URL and HF_TOKEN as Space secrets to enable "
        "the live inference panel."
    )
    return {"configured": False, "reason": hint}
# ---------------------------------------------------------------------------
# Internals
# ---------------------------------------------------------------------------
def _safe_host(url: str) -> str:
    """Return only the ``host[:port]`` part of ``url`` for display/logging.

    Userinfo (``user:password@``), path, query string and fragment are all
    dropped so credentials embedded in the URL are never surfaced.

    Args:
        url: Endpoint URL, with or without a ``scheme://`` prefix.

    Returns:
        The host (plus port if given), or ``"(unknown)"`` when no host can
        be extracted.
    """
    from urllib.parse import urlsplit

    try:
        # Without "://" urlsplit would treat the host as a path component;
        # a leading "//" makes it parse the text as a network location.
        candidate = url if "://" in url else f"//{url}"
        netloc = urlsplit(candidate).netloc
    except (AttributeError, TypeError, ValueError):
        # Non-string input or a malformed authority (e.g. bad IPv6 literal).
        return "(unknown)"

    # Everything before the last "@" is userinfo and may hold a secret.
    host = netloc.rpartition("@")[2]
    return host or "(unknown)"
def _http_post(url: str, headers: Dict[str, str], body: bytes, timeout_s: int) -> str:
    """POST ``body`` to ``url`` and return the response body decoded as UTF-8.

    Args:
        url: Absolute URL to post to.
        headers: HTTP request headers.
        body: Raw request payload.
        timeout_s: Timeout in seconds passed to ``urlopen``.

    Returns:
        The response text; undecodable bytes are replaced.

    Raises:
        RuntimeError: On a malformed URL, a non-success HTTP status (message
            includes up to 400 characters of the error body), or any
            network-level failure (timeout, refused/reset connection,
            truncated response).
    """
    import http.client  # only needed for the exception type caught below

    try:
        # Request() validates the URL and raises ValueError when, for
        # example, the scheme is missing.
        req = urllib.request.Request(url, data=body, headers=headers, method="POST")
    except ValueError as exc:
        raise RuntimeError(f"LLM endpoint URL is invalid: {exc}") from exc

    try:
        with urllib.request.urlopen(req, timeout=timeout_s) as resp:
            return resp.read().decode("utf-8", errors="replace")
    except urllib.error.HTTPError as exc:
        # Reading the error body can itself fail (e.g. time out); do not let
        # that mask the HTTP status we already know.
        try:
            detail = exc.read().decode("utf-8", errors="replace")[:400]
        except (OSError, http.client.HTTPException):
            detail = ""
        raise RuntimeError(
            f"LLM endpoint returned HTTP {exc.code}: {detail}"
        ) from exc
    except (OSError, http.client.HTTPException) as exc:
        # OSError covers URLError, socket timeouts and connection resets;
        # HTTPException covers dropped or truncated responses.
        raise RuntimeError(f"LLM endpoint unreachable: {exc}") from exc
def _call_chat(cfg: RemoteLLMConfig, prompt: str) -> str:
    """Send ``prompt`` as a single user message to ``/v1/chat/completions``.

    Returns the ``content`` of the first choice's message. Raises
    RuntimeError when the reply is not JSON or lacks the expected
    ``choices[0].message.content`` structure.
    """
    request_body = json.dumps(
        {
            "model": cfg.model_id,
            "messages": [{"role": "user", "content": prompt}],
            "temperature": 0.0,
            "max_tokens": cfg.max_new_tokens,
            "stream": False,
        }
    ).encode("utf-8")
    raw = _http_post(
        f"{cfg.endpoint_url}/v1/chat/completions",
        {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {cfg.token}",
        },
        request_body,
        cfg.timeout_s,
    )

    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError as exc:
        raise RuntimeError(f"LLM endpoint returned non-JSON: {raw[:400]}") from exc

    try:
        first_choice = parsed["choices"][0]
        return first_choice["message"]["content"]
    except (KeyError, IndexError, TypeError) as exc:
        raise RuntimeError(f"Unexpected chat response shape: {raw[:400]}") from exc
def _call_generate(cfg: RemoteLLMConfig, prompt: str) -> str:
    """Send ``prompt`` to the raw ``/generate`` endpoint and return its text.

    Args:
        cfg: Endpoint settings (URL, token, token budget, timeout).
        prompt: Text passed verbatim as ``inputs``.

    Returns:
        The ``generated_text`` field of the response, coerced to ``str``.

    Raises:
        RuntimeError: When the reply is not JSON or has no
            ``generated_text`` field.
    """
    url = f"{cfg.endpoint_url}/generate"
    payload = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": cfg.max_new_tokens,
            # No "temperature" here: TGI's /generate validation requires a
            # strictly positive temperature and rejects 0.0. Greedy decoding
            # is requested through do_sample=False instead.
            "do_sample": False,
            "return_full_text": False,
        },
    }
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {cfg.token}",
    }
    raw = _http_post(url, headers, json.dumps(payload).encode("utf-8"), cfg.timeout_s)
    try:
        data = json.loads(raw)
    except json.JSONDecodeError as exc:
        raise RuntimeError(f"LLM endpoint returned non-JSON: {raw[:400]}") from exc
    # TGI returns either {"generated_text": "..."} or a list of such objects.
    if isinstance(data, list) and data:
        data = data[0]
    if isinstance(data, dict) and "generated_text" in data:
        return str(data["generated_text"])
    raise RuntimeError(f"Unexpected /generate response shape: {raw[:400]}")
def generate(prompt: str) -> str:
    """Run ``prompt`` through the configured remote endpoint; return raw text.

    The backend is picked from the config's ``mode``: ``"generate"`` uses the
    raw ``/generate`` call, anything else uses the chat-completions call.

    Raises RuntimeError with a human-readable message when the endpoint is
    not configured, so the FastAPI demo endpoint can show it in the dashboard.
    """
    cfg = RemoteLLMConfig.from_env()
    if cfg is None:
        raise RuntimeError(
            "Remote LLM not configured. Set LLM_ENDPOINT_URL and HF_TOKEN."
        )

    # Log only the host, never the full URL or the token.
    host = _safe_host(cfg.endpoint_url)
    _LOG.info("Calling remote LLM %s mode=%s", host, cfg.mode)

    backend = _call_generate if cfg.mode == "generate" else _call_chat
    return backend(cfg, prompt)