Spaces:

Torchflow1
/

Multi-Agent-Incident-Command-Center

Running

App Files Files Community

Multi-Agent-Incident-Command-Center / server /llm_remote.py

SwapnilPatil28

Upgrade 1 - Dashboard Update and new Incidents

540b82c verified 23 days ago

raw

history blame contribute delete

7.2 kB

	"""Thin client for calling a remote LLM from the FastAPI server.

	Used by the dashboard's "live inference" panel so a Hugging Face Space can
	delegate the expensive forward pass to a dedicated HF Inference Endpoint
	(GPU-backed) without loading the model inside the Space container.

	Two backends are supported:

	- ``chat`` (default) — OpenAI-compatible ``/v1/chat/completions`` endpoint.
	Hugging Face TGI-based Inference Endpoints expose this path, as do most
	vLLM deployments. This is the recommended setup.
	- ``generate`` — Raw TGI ``/generate`` endpoint. Useful when chat templating
	is already baked into the prompt and you just want raw text completion.

	Configuration via environment variables (set them as HF Space secrets):

	- ``LLM_ENDPOINT_URL`` — required to enable the panel. E.g.
	``https://abc.us-east-1.aws.endpoints.huggingface.cloud``. Without this
	env var, ``is_configured()`` returns ``False`` and the dashboard shows a
	setup hint instead of the demo.
	- ``HF_TOKEN`` — required. A Hugging Face token with ``read``
	scope over the model repo powering the endpoint.
	- ``LLM_ENDPOINT_MODE`` — optional, one of ``chat`` / ``generate``
	(default: ``chat``).
	- ``LLM_MODEL_ID`` — optional display / routing hint the endpoint
	sometimes cares about (default: ``"tgi"``).
	- ``LLM_MAX_NEW_TOKENS`` — optional integer (default: ``160``).
	- ``LLM_TIMEOUT_S`` — optional integer (default: ``25``).

	The module uses only the Python stdlib (``urllib.request``) so it adds
	zero extra dependencies to the HF Space Docker image.
	"""

	from __future__ import annotations

	import json
	import logging
	import os
	import socket
	import urllib.error
	import urllib.request
	from dataclasses import dataclass
	from typing import Any, Dict, Optional

	# Module-level logger; the explicit "icc.llm_remote" name (rather than
	# __name__) keeps the logger name fixed regardless of how the module is imported.
	_LOG = logging.getLogger("icc.llm_remote")


	@dataclass(frozen=True)
	class RemoteLLMConfig:
	    """Immutable connection settings for the remote LLM endpoint.

	    Instances are normally created with :meth:`from_env`, which reads the
	    ``LLM_*`` and ``HF_TOKEN`` environment variables.
	    """

	    endpoint_url: str
	    token: str
	    mode: str = "chat"  # "chat" | "generate"
	    model_id: str = "tgi"
	    max_new_tokens: int = 160
	    timeout_s: int = 25

	    @staticmethod
	    def _positive_int_env(name: str, default: int) -> int:
	        """Read env var ``name`` as a positive int, else return ``default``.

	        An unset, blank, non-numeric, or non-positive value falls back to
	        ``default`` instead of raising, so one mistyped optional setting
	        cannot take down every caller of :meth:`from_env`.
	        """
	        raw = os.environ.get(name, "").strip()
	        if not raw:
	            return default
	        try:
	            value = int(raw)
	        except ValueError:
	            value = 0  # treated as invalid by the check below
	        if value <= 0:
	            logging.getLogger("icc.llm_remote").warning(
	                "Ignoring invalid %s=%r; using default %d", name, raw, default
	            )
	            return default
	        return value

	    @classmethod
	    def from_env(cls) -> Optional["RemoteLLMConfig"]:
	        """Build a config from environment variables.

	        Returns:
	            A populated config, or ``None`` when ``LLM_ENDPOINT_URL`` or
	            ``HF_TOKEN`` is missing or blank.
	        """
	        url = os.environ.get("LLM_ENDPOINT_URL", "").strip()
	        token = os.environ.get("HF_TOKEN", "").strip()
	        if not url or not token:
	            return None
	        return cls(
	            # Drop trailing slashes so path suffixes can be appended with "/".
	            endpoint_url=url.rstrip("/"),
	            token=token,
	            mode=os.environ.get("LLM_ENDPOINT_MODE", "chat").strip().lower() or "chat",
	            model_id=os.environ.get("LLM_MODEL_ID", "tgi").strip() or "tgi",
	            max_new_tokens=cls._positive_int_env("LLM_MAX_NEW_TOKENS", 160),
	            timeout_s=cls._positive_int_env("LLM_TIMEOUT_S", 25),
	        )


	def is_configured() -> bool:
	    """Report whether the environment holds a usable remote-LLM config.

	    Returns:
	        ``True`` when ``RemoteLLMConfig.from_env()`` yields a config object,
	        ``False`` when it yields ``None``.
	    """
	    cfg = RemoteLLMConfig.from_env()
	    return cfg is not None


	def status_summary() -> Dict[str, Any]:
	    """Describe the remote-LLM setup in a form the dashboard can display.

	    Returns:
	        ``{"configured": False, "reason": ...}`` when no config can be built
	        from the environment; otherwise a dict with ``configured``, ``mode``,
	        ``model_id``, ``max_new_tokens``, ``token_present`` and ``host``.
	    """
	    settings = RemoteLLMConfig.from_env()
	    if settings is not None:
	        return {
	            "configured": True,
	            "mode": settings.mode,
	            "model_id": settings.model_id,
	            "max_new_tokens": settings.max_new_tokens,
	            # The token value itself is never returned, only its presence.
	            "token_present": bool(settings.token),
	            # Report the host rather than the full endpoint URL.
	            "host": _safe_host(settings.endpoint_url),
	        }
	    hint = (
	        "Set LLM_ENDPOINT_URL and HF_TOKEN as Space secrets to enable "
	        "the live inference panel."
	    )
	    return {"configured": False, "reason": hint}


	# ---------------------------------------------------------------------------
	# Internals
	# ---------------------------------------------------------------------------


	def _safe_host(url: str) -> str:
	try:
	return url.split("://", 1)[-1].split("/", 1)[0]
	except Exception:
	return "(unknown)"


	def _http_post(url: str, headers: Dict[str, str], body: bytes, timeout_s: int) -> str:
	req = urllib.request.Request(url, data=body, headers=headers, method="POST")
	try:
	with urllib.request.urlopen(req, timeout=timeout_s) as resp:
	return resp.read().decode("utf-8", errors="replace")
	except urllib.error.HTTPError as exc:
	raise RuntimeError(
	f"LLM endpoint returned HTTP {exc.code}: {exc.read().decode('utf-8', errors='replace')[:400]}"
	) from exc
	except (urllib.error.URLError, socket.timeout, TimeoutError) as exc:
	raise RuntimeError(f"LLM endpoint unreachable: {exc}") from exc


	def _call_chat(cfg: RemoteLLMConfig, prompt: str) -> str:
	    """Send ``prompt`` as a single user message to ``/v1/chat/completions``.

	    Args:
	        cfg: Endpoint settings (URL, token, model id, token limit, timeout).
	        prompt: Text placed in the sole ``user`` message.

	    Returns:
	        The ``content`` of the first choice's message in the response.

	    Raises:
	        RuntimeError: If the response is not JSON or lacks the expected
	            ``choices[0].message.content`` path (in addition to anything
	            raised by ``_http_post``).
	    """
	    request_body = json.dumps(
	        {
	            "model": cfg.model_id,
	            "messages": [{"role": "user", "content": prompt}],
	            "temperature": 0.0,
	            "max_tokens": cfg.max_new_tokens,
	            "stream": False,
	        }
	    ).encode("utf-8")
	    raw = _http_post(
	        f"{cfg.endpoint_url}/v1/chat/completions",
	        {
	            "Content-Type": "application/json",
	            "Authorization": f"Bearer {cfg.token}",
	        },
	        request_body,
	        cfg.timeout_s,
	    )

	    try:
	        parsed = json.loads(raw)
	    except json.JSONDecodeError as exc:
	        raise RuntimeError(f"LLM endpoint returned non-JSON: {raw[:400]}") from exc

	    try:
	        first_choice = parsed["choices"][0]
	        return first_choice["message"]["content"]
	    except (KeyError, IndexError, TypeError) as exc:
	        raise RuntimeError(f"Unexpected chat response shape: {raw[:400]}") from exc


	def _call_generate(cfg: RemoteLLMConfig, prompt: str) -> str:
	    """Send ``prompt`` to the raw ``/generate`` endpoint and return its text.

	    Args:
	        cfg: Endpoint settings (URL, token, token limit, timeout).
	        prompt: Text sent as the ``inputs`` field.

	    Returns:
	        The ``generated_text`` value of the response, converted with ``str``.

	    Raises:
	        RuntimeError: If the response is not JSON or contains no
	            ``generated_text`` (in addition to anything raised by
	            ``_http_post``).
	    """
	    generation_params = {
	        "max_new_tokens": cfg.max_new_tokens,
	        "temperature": 0.0,
	        "do_sample": False,
	        "return_full_text": False,
	    }
	    request_body = json.dumps(
	        {"inputs": prompt, "parameters": generation_params}
	    ).encode("utf-8")
	    raw = _http_post(
	        f"{cfg.endpoint_url}/generate",
	        {
	            "Content-Type": "application/json",
	            "Authorization": f"Bearer {cfg.token}",
	        },
	        request_body,
	        cfg.timeout_s,
	    )

	    try:
	        parsed = json.loads(raw)
	    except json.JSONDecodeError as exc:
	        raise RuntimeError(f"LLM endpoint returned non-JSON: {raw[:400]}") from exc

	    # The payload is either one {"generated_text": ...} object or a
	    # non-empty list of them; in the list case only the first is used.
	    candidate = parsed[0] if isinstance(parsed, list) and parsed else parsed
	    if not (isinstance(candidate, dict) and "generated_text" in candidate):
	        raise RuntimeError(f"Unexpected /generate response shape: {raw[:400]}")
	    return str(candidate["generated_text"])


	def generate(prompt: str) -> str:
	    """Run ``prompt`` through the configured remote endpoint.

	    The backend is chosen from the config's ``mode``: ``"generate"`` uses
	    the raw ``/generate`` call; any other value uses the chat call.

	    Args:
	        prompt: Text to send to the model.

	    Returns:
	        The raw text returned by the selected backend.

	    Raises:
	        RuntimeError: If the endpoint is not configured, or the selected
	            backend reports a failure.
	    """
	    cfg = RemoteLLMConfig.from_env()
	    if cfg is None:
	        raise RuntimeError(
	            "Remote LLM not configured. Set LLM_ENDPOINT_URL and HF_TOKEN."
	        )
	    _LOG.info("Calling remote LLM %s mode=%s", _safe_host(cfg.endpoint_url), cfg.mode)
	    backend = _call_generate if cfg.mode == "generate" else _call_chat
	    return backend(cfg, prompt)