Adarshu07 commited on
Commit
209e6a4
·
verified ·
1 Parent(s): d07254b

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +718 -0
app.py ADDED
@@ -0,0 +1,718 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ══════════════════════════════════════════════════════════════════
3
+ ⚡ DevsDo API Server v1.0.0
4
+
5
+ OpenAI-compatible · 52 Models · Cloudflare AI Backend
6
+ SSE Streaming · <think> Reasoning · Zero API Keys
7
+
8
+ Sections
9
+ ────────
10
+ §1 Logging
11
+ §2 Model Registry (g4f-style)
12
+ §3 Register All 52 Models
13
+ §4 Think-Tag Stream Parser
14
+ §5 Backend Client (SSE → raw tokens)
15
+ §6 FastAPI App + Lifespan
16
+ §7 Pydantic Schemas
17
+ §8 Routes
18
+ §9 Stream Generator (tokens → OpenAI SSE)
19
+ §10 Non-Stream Collector
20
+ §11 Entrypoint
21
+ ══════════════════════════════════════════════════════════════════
22
+ """
23
+
24
+ from __future__ import annotations
25
+
26
+ import json, time, uuid, asyncio, random, logging
27
+ from contextlib import asynccontextmanager
28
+ from dataclasses import dataclass, asdict
29
+ from typing import Optional, AsyncGenerator, Dict, List, Any
30
+
31
+ import aiohttp
32
+ import aiohttp.resolver
33
+ from fastapi import FastAPI, HTTPException
34
+ from fastapi.responses import StreamingResponse
35
+ from fastapi.middleware.cors import CORSMiddleware
36
+ from pydantic import BaseModel, Field
37
+
38
+
39
+ # ═══════════════════════════════════════════════════════════
40
+ # §1 — LOGGING
41
+ # ═══════════════════════════════════════════════════════════
42
+
43
# Root logging config: timestamped, column-aligned level names.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s │ %(levelname)-7s │ %(message)s",
    datefmt="%H:%M:%S",
)
# Module-wide logger used by every section below.
log = logging.getLogger("devsdo")
49
+
50
+
51
+ # ═══════════════════════════════════════════════════════════
52
+ # §2 — MODEL REGISTRY (g4f-style)
53
+ #
54
+ # Each model carries:
55
+ # name – short route alias ("deepseek-r1")
56
+ # real_name – human display name ("DeepSeek R1 Distill Qwen 32B")
57
+ # author – organisation ("DeepSeek")
58
+ # family – model family group ("DeepSeek")
59
+ # model_id – backend @cf/@hf ID ("@cf/deepseek-ai/…")
60
+ # ═══════════════════════════════════════════════════════════
61
+
62
@dataclass(frozen=True, slots=True)
class ModelCard:
    """Immutable descriptor for one routable chat model."""

    name: str       # short route alias, e.g. "deepseek-r1"
    real_name: str  # human-readable display name
    author: str     # publishing organisation
    family: str     # grouping key used by /api/internal/v1/models
    model_id: str   # backend @cf/… or @hf/… identifier
69
+
70
+
71
class Registry:
    """Central model store — register once, resolve anywhere."""

    _by_name: Dict[str, ModelCard] = {}
    _by_id: Dict[str, ModelCard] = {}
    _default: str = ""

    # ── mutators ──────────────────────────────────────
    @classmethod
    def add(cls, *cards: ModelCard):
        """Index every card by alias and by backend id; first-ever card becomes the default."""
        for card in cards:
            cls._by_name[card.name] = card
            cls._by_id[card.model_id] = card
        if cards and not cls._default:
            cls._default = cards[0].name

    # ── lookups ───────────────────────────────────────
    @classmethod
    def resolve(cls, raw: Optional[str]) -> str:
        """Alias / full-id / fuzzy → backend model_id.

        Resolution order: default (empty input) → strip known route
        prefixes → literal @cf/@hf id → exact alias → substring match
        against aliases and backend ids → pass the input through as-is.
        """
        if not raw:
            return cls._by_name[cls._default].model_id

        query = raw.strip()
        for prefix in ("devsdo/", "devsdo:", "cloudflare/", "cf/"):
            if query.lower().startswith(prefix):
                query = query[len(prefix):]
                break

        if query.startswith(("@cf/", "@hf/")):
            return query

        exact = cls._by_name.get(query)
        if exact is not None:
            return exact.model_id

        needle = query.lower()
        for alias, candidate in cls._by_name.items():
            if needle in alias or needle in candidate.model_id.lower():
                return candidate.model_id

        return query  # pass-through

    @classmethod
    def find(cls, raw: str) -> Optional[ModelCard]:
        """Best-effort card lookup: resolved backend id first, raw alias second."""
        resolved = cls.resolve(raw)
        return cls._by_id.get(resolved) or cls._by_name.get(raw)

    @classmethod
    def all_cards(cls) -> List[ModelCard]:
        """Every registered card, in registration order."""
        return list(cls._by_name.values())

    # ── serialisers ───────────────────────────────────
    @classmethod
    def openai_list(cls) -> dict:
        """GET /v1/models — OpenAI-compatible."""
        data = [
            {
                "id": card.name,
                "object": "model",
                "created": 1700000000,
                "owned_by": card.author.lower().replace(" ", "-"),
            }
            for card in cls._by_name.values()
        ]
        return {"object": "list", "data": data}

    @classmethod
    def internal_list(cls) -> dict:
        """GET /api/internal/v1/models — rich, grouped by family."""
        grouped: Dict[str, list] = {}
        for card in cls._by_name.values():
            entry = {
                "id": card.name,
                "name": card.real_name,
                "author": card.author,
                "backend_id": card.model_id,
            }
            grouped.setdefault(card.family, []).append(entry)

        families = [
            {"family": fname, "count": len(members), "models": members}
            for fname, members in grouped.items()
        ]
        return {
            "server": "DevsDo API",
            "version": "1.0.0",
            "timestamp": int(time.time()),
            "total": len(cls._by_name),
            "families": families,
        }
157
+
158
+
159
+ # ═══════════════════════════════════════════════════════════
160
+ # §3 — REGISTER ALL 52 MODELS
161
+ # ═══════════════════════════════════════════════════════════
162
+
163
# One-time registration of all 52 models.
# Order matters: the first card ("kimi-k2.5") becomes the default model.
Registry.add(
    # ─── Flagship / Large ─────────────────────────────────
    ModelCard("kimi-k2.5", "Kimi K2.5", "Moonshot AI", "Kimi", "@cf/moonshotai/kimi-k2.5"),
    ModelCard("nemotron-120b", "Nemotron 3 120B A12B", "NVIDIA", "Nemotron", "@cf/nvidia/nemotron-3-120b-a12b"),
    ModelCard("gpt-oss-120b", "GPT-OSS 120B", "OpenAI", "GPT-OSS", "@cf/openai/gpt-oss-120b"),
    ModelCard("gpt-oss-20b", "GPT-OSS 20B", "OpenAI", "GPT-OSS", "@cf/openai/gpt-oss-20b"),
    ModelCard("llama-3.3-70b", "LLaMA 3.3 70B Instruct FP8", "Meta", "LLaMA", "@cf/meta/llama-3.3-70b-instruct-fp8-fast"),

    # ─── Meta LLaMA ───────────────────────────────────────
    ModelCard("llama-4-scout", "LLaMA 4 Scout 17B 16E", "Meta", "LLaMA", "@cf/meta/llama-4-scout-17b-16e-instruct"),
    ModelCard("llama-3.2-11b-vision", "LLaMA 3.2 11B Vision", "Meta", "LLaMA", "@cf/meta/llama-3.2-11b-vision-instruct"),
    ModelCard("llama-3.1-8b", "LLaMA 3.1 8B Fast", "Meta", "LLaMA", "@cf/meta/llama-3.1-8b-instruct-fast"),
    ModelCard("llama-3.1-8b-fp8", "LLaMA 3.1 8B FP8", "Meta", "LLaMA", "@cf/meta/llama-3.1-8b-instruct-fp8"),
    ModelCard("llama-3.1-8b-awq", "LLaMA 3.1 8B AWQ", "Meta", "LLaMA", "@cf/meta/llama-3.1-8b-instruct-awq"),
    ModelCard("llama-3.2-3b", "LLaMA 3.2 3B", "Meta", "LLaMA", "@cf/meta/llama-3.2-3b-instruct"),
    ModelCard("llama-3.2-1b", "LLaMA 3.2 1B", "Meta", "LLaMA", "@cf/meta/llama-3.2-1b-instruct"),
    ModelCard("llama-3-8b", "LLaMA 3 8B", "Meta", "LLaMA", "@cf/meta/llama-3-8b-instruct"),
    ModelCard("llama-3-8b-awq", "LLaMA 3 8B AWQ", "Meta", "LLaMA", "@cf/meta/llama-3-8b-instruct-awq"),
    ModelCard("llama-guard-3", "LLaMA Guard 3 8B", "Meta", "LLaMA", "@cf/meta/llama-guard-3-8b"),
    ModelCard("llama-2-7b-fp16", "LLaMA 2 7B FP16", "Meta", "LLaMA", "@cf/meta/llama-2-7b-chat-fp16"),
    ModelCard("llama-2-7b-int8", "LLaMA 2 7B INT8", "Meta", "LLaMA", "@cf/meta/llama-2-7b-chat-int8"),
    ModelCard("llama-2-7b-lora", "LLaMA 2 7B LoRA", "Meta", "LLaMA", "@cf/meta-llama/llama-2-7b-chat-hf-lora"),
    ModelCard("llama-2-13b", "LLaMA 2 13B AWQ", "Meta", "LLaMA", "@hf/thebloke/llama-2-13b-chat-awq"),

    # ─── Qwen ─────────────────────────────────────────────
    ModelCard("qwq-32b", "QwQ 32B", "Qwen", "Qwen", "@cf/qwen/qwq-32b"),
    ModelCard("qwen-coder-32b", "Qwen 2.5 Coder 32B", "Qwen", "Qwen", "@cf/qwen/qwen2.5-coder-32b-instruct"),
    ModelCard("qwen3-30b", "Qwen 3 30B A3B FP8", "Qwen", "Qwen", "@cf/qwen/qwen3-30b-a3b-fp8"),
    ModelCard("qwen1.5-14b", "Qwen 1.5 14B AWQ", "Qwen", "Qwen", "@cf/qwen/qwen1.5-14b-chat-awq"),
    ModelCard("qwen1.5-7b", "Qwen 1.5 7B AWQ", "Qwen", "Qwen", "@cf/qwen/qwen1.5-7b-chat-awq"),
    ModelCard("qwen1.5-1.8b", "Qwen 1.5 1.8B", "Qwen", "Qwen", "@cf/qwen/qwen1.5-1.8b-chat"),
    ModelCard("qwen1.5-0.5b", "Qwen 1.5 0.5B", "Qwen", "Qwen", "@cf/qwen/qwen1.5-0.5b-chat"),

    # ─── DeepSeek ─────────────────────────────────────────
    ModelCard("deepseek-r1", "DeepSeek R1 Distill Qwen 32B", "DeepSeek", "DeepSeek", "@cf/deepseek-ai/deepseek-r1-distill-qwen-32b"),
    ModelCard("deepseek-math", "DeepSeek Math 7B", "DeepSeek", "DeepSeek", "@cf/deepseek-ai/deepseek-math-7b-instruct"),
    ModelCard("deepseek-coder-base", "DeepSeek Coder 6.7B Base", "DeepSeek", "DeepSeek", "@hf/thebloke/deepseek-coder-6.7b-base-awq"),
    ModelCard("deepseek-coder", "DeepSeek Coder 6.7B Instruct", "DeepSeek", "DeepSeek", "@hf/thebloke/deepseek-coder-6.7b-instruct-awq"),

    # ─── Google Gemma ─────────────────────────────────────
    ModelCard("gemma-3-12b", "Gemma 3 12B IT", "Google", "Gemma", "@cf/google/gemma-3-12b-it"),
    ModelCard("gemma-7b", "Gemma 7B IT", "Google", "Gemma", "@hf/google/gemma-7b-it"),
    ModelCard("gemma-2b-lora", "Gemma 2B IT LoRA", "Google", "Gemma", "@cf/google/gemma-2b-it-lora"),
    ModelCard("gemma-7b-lora", "Gemma 7B IT LoRA", "Google", "Gemma", "@cf/google/gemma-7b-it-lora"),

    # ─── Mistral ──────────────────────────────────────────
    ModelCard("mistral-small-3.1", "Mistral Small 3.1 24B", "Mistral AI", "Mistral", "@cf/mistralai/mistral-small-3.1-24b-instruct"),
    ModelCard("mistral-v0.2", "Mistral 7B v0.2", "Mistral AI", "Mistral", "@hf/mistral/mistral-7b-instruct-v0.2"),
    ModelCard("mistral-v0.2-lora", "Mistral 7B v0.2 LoRA", "Mistral AI", "Mistral", "@cf/mistral/mistral-7b-instruct-v0.2-lora"),
    ModelCard("mistral-v0.1", "Mistral 7B v0.1", "Mistral AI", "Mistral", "@cf/mistral/mistral-7b-instruct-v0.1"),
    ModelCard("mistral-v0.1-awq", "Mistral 7B v0.1 AWQ", "Mistral AI", "Mistral", "@hf/thebloke/mistral-7b-instruct-v0.1-awq"),

    # ─── IBM Granite ──────────────────────────────────────
    ModelCard("granite-4.0", "Granite 4.0 H Micro", "IBM", "Granite", "@cf/ibm-granite/granite-4.0-h-micro"),

    # ─── ZhipuAI GLM ─────────────────────────────────────
    ModelCard("glm-4.7-flash", "GLM 4.7 Flash", "ZhipuAI", "GLM", "@cf/zai-org/glm-4.7-flash"),

    # ─── AI Singapore ─────────────────────────────────────
    ModelCard("sea-lion-27b", "SEA-LION v4 27B", "AI Singapore", "SEA-LION", "@cf/aisingapore/gemma-sea-lion-v4-27b-it"),

    # ─── Community / Other ────────────────────────────────
    ModelCard("hermes-2-pro", "Hermes 2 Pro Mistral 7B", "NousResearch", "Hermes", "@hf/nousresearch/hermes-2-pro-mistral-7b"),
    ModelCard("openhermes-2.5", "OpenHermes 2.5 Mistral 7B", "NousResearch", "Hermes", "@hf/thebloke/openhermes-2.5-mistral-7b-awq"),
    ModelCard("starling-7b", "Starling LM 7B Beta", "Nexusflow", "Starling", "@hf/nexusflow/starling-lm-7b-beta"),
    ModelCard("neural-chat-7b", "Neural Chat 7B v3.1", "Intel", "Neural Chat", "@hf/thebloke/neural-chat-7b-v3-1-awq"),
    ModelCard("openchat-3.5", "OpenChat 3.5", "OpenChat", "OpenChat", "@cf/openchat/openchat-3.5-0106"),
    ModelCard("cybertron-7b", "UNA Cybertron 7B v2", "fblgit", "Cybertron", "@cf/fblgit/una-cybertron-7b-v2-bf16"),
    ModelCard("discolm-german-7b", "DiscoLM German 7B", "TheBloke", "DiscoLM", "@cf/thebloke/discolm-german-7b-v1-awq"),
    ModelCard("zephyr-7b", "Zephyr 7B Beta", "HuggingFace", "Zephyr", "@hf/thebloke/zephyr-7b-beta-awq"),
    ModelCard("falcon-7b", "Falcon 7B Instruct", "TII UAE", "Falcon", "@cf/tiiuae/falcon-7b-instruct"),
    ModelCard("tinyllama-1.1b", "TinyLlama 1.1B Chat", "TinyLlama", "TinyLlama", "@cf/tinyllama/tinyllama-1.1b-chat-v1.0"),
    ModelCard("phi-2", "Phi 2", "Microsoft", "Phi", "@cf/microsoft/phi-2"),
    ModelCard("sqlcoder", "SQLCoder 7B 2", "Defog", "SQLCoder", "@cf/defog/sqlcoder-7b-2"),
)
238
+
239
+
240
+ # ═══════════════════════════════════════════════════════════
241
+ # §4 — THINK-TAG STREAM PARSER
242
+ #
243
+ # Detects <think>…</think> across chunked tokens.
244
+ # Yields ("reasoning", text) or ("content", text).
245
+ # Handles tags split across multiple SSE tokens.
246
+ # ═══════════════════════════════════════════════════════════
247
+
248
class ThinkParser:
    """Incremental classifier for <think>…</think> spans in a token stream.

    feed() returns ("reasoning", text) / ("content", text) fragments and
    correctly handles tags that arrive split across several tokens by
    holding back any trailing partial tag until the next token.
    """

    __slots__ = ("thinking", "buf")

    OPEN = "<think>"    # 7 chars
    CLOSE = "</think>"  # 8 chars

    def __init__(self):
        # True while inside an unclosed <think> span.
        self.thinking = False
        # Text held back because it may end in a partial tag.
        self.buf = ""

    # ── feed one token, get classified fragments ──────
    def feed(self, token: str) -> list[tuple[str, str]]:
        """Classify one incoming token into reasoning/content fragments."""
        self.buf += token
        fragments: list[tuple[str, str]] = []

        while self.buf:
            if self.thinking:
                marker, label = self.CLOSE, "reasoning"
            else:
                marker, label = self.OPEN, "content"

            pos = self.buf.find(marker)
            if pos != -1:
                # Complete tag present: emit preceding text, toggle mode.
                if pos:
                    fragments.append((label, self.buf[:pos]))
                self.buf = self.buf[pos + len(marker):]
                self.thinking = not self.thinking
                continue

            # No full tag; a prefix of one may be dangling at the end.
            dangling = self._partial(marker)
            if dangling:
                emit = self.buf[: len(self.buf) - len(dangling)]
                if emit:
                    fragments.append((label, emit))
                self.buf = dangling
            else:
                fragments.append((label, self.buf))
                self.buf = ""
            break

        return fragments

    # ── drain remaining buffer at stream end ──────────
    def flush(self) -> list[tuple[str, str]]:
        """Emit whatever is still buffered once the stream has ended."""
        if not self.buf:
            return []
        label = "reasoning" if self.thinking else "content"
        leftover = [(label, self.buf)]
        self.buf = ""
        return leftover

    # ── helper ────────────────────────────────────────
    def _partial(self, tag: str) -> str:
        """Longest suffix of the buffer that is a proper prefix of *tag*."""
        limit = min(len(tag) - 1, len(self.buf))
        for size in range(limit, 0, -1):
            if tag.startswith(self.buf[-size:]):
                return self.buf[-size:]
        return ""
305
+
306
+
307
+ # ═══════════════════════════════════════════════════════════
308
+ # §5 — BACKEND CLIENT
309
+ #
310
+ # Talks to the Cloudflare AI proxy hosted on HF Spaces.
311
+ # Parses upstream SSE and yields raw string tokens.
312
+ # Retries on transient HTTP errors.
313
+ # ═══════════════════════════════════════════════════════════
314
+
315
# Upstream Cloudflare-AI proxy (hosted on HF Spaces).
_BACKEND = "https://adarshu07-ls.hf.space"
_BACKEND_URL = f"{_BACKEND}/v1/chat/completions"

# Transient statuses worth retrying (incl. Cloudflare 52x edge errors).
_RETRYABLE = frozenset({429, 500, 502, 503, 504, 520, 521, 522, 523, 524})
# Client-side errors: retrying cannot help, fail immediately.
_FATAL = frozenset({400, 401, 403, 404, 405, 422})

# Browser-like headers sent with every backend request.
_BE_HEADERS = {
    "Accept": "application/json",
    "Accept-Encoding": "gzip, deflate, br",
    "Content-Type": "application/json",
    "Origin": _BACKEND,
    "Referer": f"{_BACKEND}/docs",
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/131.0.0.0 Safari/537.36"
    ),
}
333
+
334
+
335
+ def _parse_sse(line: str) -> tuple[str, bool]:
336
+ """One SSE data: line → (token_text, is_done)."""
337
+ line = line.strip()
338
+ if not line.startswith("data:"):
339
+ return "", False
340
+ payload = line[5:].strip()
341
+ if payload == "[DONE]":
342
+ return "", True
343
+ try:
344
+ obj = json.loads(payload)
345
+ if "error" in obj:
346
+ return "", True
347
+ delta = obj.get("choices", [{}])[0].get("delta", {})
348
+ return delta.get("content", "") or "", False
349
+ except (json.JSONDecodeError, KeyError, IndexError):
350
+ return "", False
351
+
352
+
353
async def backend_stream(
    session: aiohttp.ClientSession,
    messages: list[dict],
    model_id: str,
    temperature: float = 0.7,
    max_tokens: int = 4096,
    timeout: int = 180,
    retries: int = 2,
) -> AsyncGenerator[str, None]:
    """POST → upstream, parse SSE, yield raw tokens.

    Args:
        session: shared aiohttp session (created in lifespan).
        messages: OpenAI-style [{"role": …, "content": …}] list.
        model_id: resolved backend @cf/@hf model identifier.
        temperature: sampling temperature, forwarded verbatim.
        max_tokens: completion budget; a falsy value omits the field.
        timeout: total request / per-read timeout in seconds.
        retries: extra attempts after the first try.

    Raises:
        RuntimeError: fatal HTTP status, exhausted retries, or an
            unreachable backend.
    """

    body: dict = {
        "model": model_id,
        "messages": messages,
        "stream": True,
        "temperature": temperature,
    }
    if max_tokens:
        body["max_tokens"] = max_tokens

    last_err = ""

    for attempt in range(1 + retries):
        try:
            async with session.post(
                _BACKEND_URL,
                json=body,
                timeout=aiohttp.ClientTimeout(
                    total=timeout,
                    sock_connect=30,
                    sock_read=timeout,
                ),
            ) as resp:

                if resp.status == 200:
                    # Read upstream SSE line-by-line and yield only the
                    # token text; stop on [DONE] / upstream error / EOF.
                    while True:
                        raw = await resp.content.readline()
                        if not raw:
                            break  # connection closed by upstream
                        line = raw.decode("utf-8", errors="replace")
                        if not line.strip():
                            continue  # SSE frame separator
                        tok, done = _parse_sse(line)
                        if done:
                            return
                        if tok:
                            yield tok
                    return

                text = await resp.text()
                last_err = f"HTTP {resp.status}: {text[:300]}"

                if resp.status in _FATAL:
                    raise RuntimeError(last_err)
                if resp.status in _RETRYABLE and attempt < retries:
                    # Linear-ish backoff with jitter, capped at 15 s.
                    wait = min(2.0 * (attempt + 1) + random.random(), 15)
                    log.warning(f"Retry {attempt+1}/{retries} in {wait:.1f}s — {last_err}")
                    await asyncio.sleep(wait)
                    continue
                raise RuntimeError(last_err)

        except (RuntimeError, GeneratorExit):
            # Our own failures / consumer cancellation: never retried.
            raise
        except (aiohttp.ClientError, asyncio.TimeoutError, OSError) as exc:
            # Transport-level failure — retry with a simple linear delay.
            last_err = str(exc)
            if attempt < retries:
                log.warning(f"Retry {attempt+1}/{retries} — {last_err}")
                await asyncio.sleep(1.5 * (attempt + 1))
                continue
            raise RuntimeError(f"Backend unreachable: {last_err}") from exc

    raise RuntimeError(f"All retries exhausted: {last_err}")
425
+
426
+
427
+ # ═══════════════════════════════════════════════════════════
428
+ # §6 — FASTAPI APP + LIFESPAN
429
+ # ═══════════════════════════════════════════════════════════
430
+
431
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Create the shared aiohttp session on startup; close it on shutdown."""
    # ── startup ───────────────────────────────────────
    connector = aiohttp.TCPConnector(
        resolver=aiohttp.resolver.ThreadedResolver(),
        limit=100,            # total pooled connections
        limit_per_host=15,
        ttl_dns_cache=300,    # seconds
        keepalive_timeout=60,
        enable_cleanup_closed=True,
    )
    # Single shared session for all requests; handlers read app.state.http.
    app.state.http = aiohttp.ClientSession(
        connector=connector,
        headers=_BE_HEADERS,
    )
    log.info("══════════════════════════════════════════")
    log.info(" ⚡ DevsDo API Server v1.0.0")
    log.info(f" Models : {len(Registry.all_cards())}")
    log.info(f" Backend: {_BACKEND}")
    log.info(f" Port : 7860")
    log.info("══════════════════════════════════════════")
    yield
    # ── shutdown ──────────────────────────────────────
    await app.state.http.close()
    log.info("Server stopped ✓")
456
+
457
+
458
# Public FastAPI application — OpenAI-compatible surface.
app = FastAPI(
    title="⚡ DevsDo API",
    description="OpenAI-compatible · 52 Models · Streaming · Reasoning",
    version="1.0.0",
    docs_url="/docs",
    redoc_url="/redoc",
    lifespan=lifespan,
)

# Wide-open CORS for browser clients.
# NOTE(review): allow_origins=["*"] combined with allow_credentials=True is
# rejected by browsers per the CORS spec (credentialed requests may not use
# a wildcard origin) — confirm whether credentials are actually needed here.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
474
+
475
+
476
+ # ═══════════════════════════════════════════════════════════
477
+ # §7 — PYDANTIC SCHEMAS
478
+ # ═══════════════════════════════════════════════════════════
479
+
480
class Message(BaseModel):
    """One chat turn — free-form role and content strings."""

    role: str
    content: str

class ChatRequest(BaseModel):
    """Body of POST /v1/chat/completions (OpenAI-compatible subset)."""

    model: str = "kimi-k2.5"  # alias, full @cf/@hf id, or fuzzy name
    messages: list[Message] = Field(..., min_length=1)
    stream: bool = False
    temperature: float = Field(default=0.7, ge=0.0, le=2.0)
    max_tokens: Optional[int] = Field(default=4096, ge=1)
490
+
491
+
492
+ # ═══════════════════════════════════════════════════════════
493
+ # §8 — ROUTES
494
+ # ═══════════════════════════════════════════════════════════
495
+
496
+ def _cid() -> str:
497
+ """Generate a chat-completion ID."""
498
+ return f"chatcmpl-{uuid.uuid4().hex[:29]}"
499
+
500
+ def _sse(obj: Any) -> str:
501
+ """Format one SSE frame."""
502
+ return f"data: {json.dumps(obj, ensure_ascii=False)}\n\n"
503
+
504
+
505
+ # ── info ──────────────────────────────────────────────────
506
+
507
+ @app.get("/")
508
+ async def root():
509
+ return {
510
+ "service": "⚡ DevsDo API",
511
+ "version": "1.0.0",
512
+ "status": "running",
513
+ "models": len(Registry.all_cards()),
514
+ "docs": "/docs",
515
+ "endpoints": {
516
+ "health": "GET /health",
517
+ "models_openai": "GET /v1/models",
518
+ "models_detail": "GET /api/internal/v1/models",
519
+ "chat": "POST /v1/chat/completions",
520
+ },
521
+ }
522
+
523
+
524
+ @app.get("/health")
525
+ async def health():
526
+ return {
527
+ "status": "healthy",
528
+ "timestamp": int(time.time()),
529
+ "models": len(Registry.all_cards()),
530
+ "backend": _BACKEND,
531
+ }
532
+
533
+
534
+ # ── models ────────────────────────────────────────────────
535
+
536
+ @app.get("/v1/models")
537
+ async def models_openai():
538
+ """OpenAI-compatible model list."""
539
+ return Registry.openai_list()
540
+
541
+
542
+ @app.get("/api/internal/v1/models")
543
+ async def models_internal():
544
+ """Rich model registry grouped by family."""
545
+ return Registry.internal_list()
546
+
547
+
548
+ # ── chat completions ─────────────────────────────────────
549
+
550
+ @app.post("/v1/chat/completions")
551
+ async def chat_completions(req: ChatRequest):
552
+ """
553
+ OpenAI-compatible chat completions.
554
+
555
+ • stream=false → JSON (reasoning in `reasoning_content`)
556
+ • stream=true → SSE (reasoning chunks use `reasoning_content` in delta)
557
+ """
558
+ model_id = Registry.resolve(req.model)
559
+ card = Registry.find(req.model)
560
+ display = card.name if card else req.model
561
+
562
+ msgs = [{"role": m.role, "content": m.content} for m in req.messages]
563
+
564
+ if req.stream:
565
+ return StreamingResponse(
566
+ _stream_gen(app.state.http, msgs, model_id, display,
567
+ req.temperature, req.max_tokens or 4096),
568
+ media_type="text/event-stream",
569
+ headers={
570
+ "Cache-Control": "no-cache",
571
+ "Connection": "keep-alive",
572
+ "X-Accel-Buffering": "no",
573
+ },
574
+ )
575
+
576
+ return await _complete(
577
+ app.state.http, msgs, model_id, display,
578
+ req.temperature, req.max_tokens or 4096,
579
+ )
580
+
581
+
582
+ # ═══════════════════════════════════════════════════════════
583
+ # §9 — SSE STREAM GENERATOR
584
+ #
585
+ # backend tokens → ThinkParser → OpenAI SSE chunks
586
+ #
587
+ # Reasoning tokens go into delta.reasoning_content
588
+ # Normal tokens go into delta.content
589
+ # ═══════════════════════════════════════════════════════════
590
+
591
async def _stream_gen(
    session: aiohttp.ClientSession,
    messages: list[dict],
    model_id: str,
    model_name: str,
    temperature: float,
    max_tokens: int,
) -> AsyncGenerator[str, None]:
    """Relay backend tokens as OpenAI ``chat.completion.chunk`` SSE frames.

    Reasoning fragments (inside <think>…</think>) are emitted in
    delta.reasoning_content; everything else in delta.content. Errors are
    surfaced in-band as a final content chunk so the client stream still
    terminates with [DONE].
    """

    cid = _cid()
    ts = int(time.time())
    parser = ThinkParser()

    def _chunk(delta: dict, finish: Optional[str] = None) -> str:
        # One OpenAI-shaped SSE frame sharing this stream's id/ts/model.
        return _sse({
            "id": cid,
            "object": "chat.completion.chunk",
            "created": ts,
            "model": model_name,
            "choices": [{
                "index": 0,
                "delta": delta,
                "finish_reason": finish,
            }],
        })

    # ── role announcement ─────────────────────────────
    yield _chunk({"role": "assistant"})

    try:
        async for token in backend_stream(
            session, messages, model_id, temperature, max_tokens,
        ):
            for kind, text in parser.feed(token):
                if kind == "reasoning":
                    yield _chunk({"reasoning_content": text})
                else:
                    yield _chunk({"content": text})

        # ── flush parser buffer ───────────────────────
        for kind, text in parser.flush():
            if kind == "reasoning":
                yield _chunk({"reasoning_content": text})
            else:
                yield _chunk({"content": text})

        # ── stop ──────────────────────────────────────
        yield _chunk({}, finish="stop")
        yield "data: [DONE]\n\n"

    except Exception as exc:
        # In-band error report, then a clean stream termination.
        log.error(f"Stream error [{model_name}]: {exc}")
        yield _chunk({"content": f"\n\n[Error: {exc}]"}, finish="error")
        yield "data: [DONE]\n\n"
645
+
646
+
647
+ # ═══════════════════════════════════════════════════════════
648
+ # §10 — NON-STREAMING COLLECTOR
649
+ # ═══════════════════════════════════════════════════════════
650
+
651
async def _complete(
    session: aiohttp.ClientSession,
    messages: list[dict],
    model_id: str,
    model_name: str,
    temperature: float,
    max_tokens: int,
) -> dict:
    """Collect the full backend stream into one OpenAI-style response.

    Reasoning (<think>…</think>) text is returned separately in
    message.reasoning_content; everything else in message.content.

    Raises:
        HTTPException: 502 for any backend failure, with the original
            exception chained for server-side tracebacks.
    """

    parser = ThinkParser()
    reasoning: list[str] = []
    content: list[str] = []

    try:
        async for token in backend_stream(
            session, messages, model_id, temperature, max_tokens,
        ):
            for kind, text in parser.feed(token):
                (reasoning if kind == "reasoning" else content).append(text)

        for kind, text in parser.flush():
            (reasoning if kind == "reasoning" else content).append(text)

    except Exception as exc:
        # Boundary handler: translate any backend failure into a 502.
        # `from exc` preserves the causal chain in logs/tracebacks.
        raise HTTPException(status_code=502, detail=f"Backend error: {exc}") from exc

    msg: dict = {
        "role": "assistant",
        "content": "".join(content),
    }
    if reasoning:
        msg["reasoning_content"] = "".join(reasoning)

    total_chars = len(msg["content"]) + len(msg.get("reasoning_content", ""))

    return {
        "id": _cid(),
        "object": "chat.completion",
        "created": int(time.time()),
        "model": model_name,
        "choices": [{
            "index": 0,
            "message": msg,
            "finish_reason": "stop",
        }],
        "usage": {
            "prompt_tokens": 0,
            # No tokenizer available locally — ~4 chars/token heuristic.
            "completion_tokens": total_chars // 4,
            "total_tokens": total_chars // 4,
        },
    }
703
+
704
+
705
+ # ═══════════════════════════════════════════════════════════
706
+ # §11 — ENTRYPOINT
707
+ # ═══════════════════════════════════════════════════════════
708
+
709
+ if __name__ == "__main__":
710
+ import uvicorn
711
+ uvicorn.run(
712
+ "app:app",
713
+ host="0.0.0.0",
714
+ port=7860,
715
+ workers=1,
716
+ timeout_keep_alive=120,
717
+ log_level="info",
718
+ )