"""
══════════════════════════════════════════════════════════════════
  ⚡  DevsDo API Server  v1.0.0

  OpenAI-compatible · 54 Models · Cloudflare AI Backend
  SSE Streaming  ·  <think> Reasoning  ·  Zero API Keys

  Sections
  ────────
   §1  Logging
   §2  Model Registry  (g4f-style)
   §3  Register All 54 Models
   §4  Think-Tag Stream Parser
   §5  Backend Client  (SSE → raw tokens)
   §6  FastAPI App + Lifespan
   §7  Pydantic Schemas
   §8  Routes
   §9  Stream Generator  (tokens → OpenAI SSE)
   §10 Non-Stream Collector
   §11 Entrypoint
══════════════════════════════════════════════════════════════════
"""

from __future__ import annotations

import json, time, uuid, asyncio, random, logging
from contextlib import asynccontextmanager
from dataclasses import dataclass, asdict
from typing import Optional, AsyncGenerator, Dict, List, Any

import aiohttp
import aiohttp.resolver
from fastapi import FastAPI, HTTPException
from fastapi.responses import StreamingResponse
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, Field


# ═══════════════════════════════════════════════════════════
# §1 — LOGGING
# ═══════════════════════════════════════════════════════════

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s │ %(levelname)-7s │ %(message)s",
    datefmt="%H:%M:%S",
)
log = logging.getLogger("devsdo")


# ═══════════════════════════════════════════════════════════
# §2 — MODEL REGISTRY  (g4f-style)
#
#  Each model carries:
#    name       – short route alias     ("deepseek-r1")
#    real_name  – human display name    ("DeepSeek R1 Distill Qwen 32B")
#    author     – organisation          ("DeepSeek")
#    family     – model family group    ("DeepSeek")
#    model_id   – backend @cf/@hf ID   ("@cf/deepseek-ai/…")
# ═══════════════════════════════════════════════════════════

@dataclass(frozen=True, slots=True)
class ModelCard:
    name:      str
    real_name: str
    author:    str
    family:    str
    model_id:  str


class Registry:
    """Central model store — register once, resolve anywhere."""

    _by_name: Dict[str, ModelCard] = {}
    _by_id:   Dict[str, ModelCard] = {}
    _default: str = ""

    # ── mutators ──────────────────────────────────────
    @classmethod
    def add(cls, *cards: ModelCard):
        for c in cards:
            cls._by_name[c.name] = c
            cls._by_id[c.model_id] = c
            if not cls._default:
                cls._default = c.name

    # ── lookups ───────────────────────────────────────
    @classmethod
    def resolve(cls, raw: Optional[str]) -> str:
        """Alias / full-id / fuzzy  →  backend model_id."""
        if not raw:
            return cls._by_name[cls._default].model_id
        raw = raw.strip()
        for pfx in ("devsdo/", "devsdo:", "cloudflare/", "cf/"):
            if raw.lower().startswith(pfx):
                raw = raw[len(pfx):]
                break
        if raw.startswith(("@cf/", "@hf/")):
            return raw
        if raw in cls._by_name:
            return cls._by_name[raw].model_id
        low = raw.lower()
        for alias, card in cls._by_name.items():
            if low in alias or low in card.model_id.lower():
                return card.model_id
        return raw                      # pass-through

    @classmethod
    def find(cls, raw: str) -> Optional[ModelCard]:
        mid = cls.resolve(raw)
        return cls._by_id.get(mid) or cls._by_name.get(raw)

    @classmethod
    def all_cards(cls) -> List[ModelCard]:
        return list(cls._by_name.values())

    # ── serialisers ───────────────────────────────────
    @classmethod
    def openai_list(cls) -> dict:
        """GET /v1/models — OpenAI-compatible."""
        return {
            "object": "list",
            "data": [
                {
                    "id":       c.name,
                    "object":   "model",
                    "created":  1700000000,
                    "owned_by": c.author.lower().replace(" ", "-"),
                }
                for c in cls._by_name.values()
            ],
        }

    @classmethod
    def internal_list(cls) -> dict:
        """GET /api/internal/v1/models — rich, grouped by family."""
        fam: Dict[str, list] = {}
        for c in cls._by_name.values():
            fam.setdefault(c.family, []).append(
                {
                    "id":         c.name,
                    "name":       c.real_name,
                    "author":     c.author,
                    "backend_id": c.model_id,
                }
            )
        return {
            "server":    "DevsDo API",
            "version":   "1.0.0",
            "timestamp": int(time.time()),
            "total":     len(cls._by_name),
            "families": [
                {"family": fn, "count": len(ms), "models": ms}
                for fn, ms in fam.items()
            ],
        }

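# Illustrative lookups (comments only, not executed); the exact mapping depends
# on the cards registered in §3 below:
#
#   Registry.resolve("deepseek-r1")
#       → "@cf/deepseek-ai/deepseek-r1-distill-qwen-32b"   (exact alias)
#   Registry.resolve("devsdo/qwen-coder-32b")
#       → "@cf/qwen/qwen2.5-coder-32b-instruct"            (prefix stripped first)
#   Registry.resolve("@cf/openai/gpt-oss-20b")
#       → "@cf/openai/gpt-oss-20b"                         (full backend ID passes through)
#   Registry.resolve(None)
#       → "@cf/moonshotai/kimi-k2.5"                       (first registered card is the default)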

# ═══════════════════════════════════════════════════════════
# §3 — REGISTER ALL 54 MODELS
# ═══════════════════════════════════════════════════════════

Registry.add(
    # ─── Flagship / Large ─────────────────────────────────
    ModelCard("kimi-k2.5",        "Kimi K2.5",                   "Moonshot AI",   "Kimi",      "@cf/moonshotai/kimi-k2.5"),
    ModelCard("nemotron-120b",    "Nemotron 3 120B A12B",        "NVIDIA",        "Nemotron",  "@cf/nvidia/nemotron-3-120b-a12b"),
    ModelCard("gpt-oss-120b",     "GPT-OSS 120B",                "OpenAI",        "GPT-OSS",   "@cf/openai/gpt-oss-120b"),
    ModelCard("gpt-oss-20b",      "GPT-OSS 20B",                 "OpenAI",        "GPT-OSS",   "@cf/openai/gpt-oss-20b"),
    ModelCard("llama-3.3-70b",    "LLaMA 3.3 70B Instruct FP8",  "Meta",         "LLaMA",     "@cf/meta/llama-3.3-70b-instruct-fp8-fast"),

    # ─── Meta LLaMA ───────────────────────────────────────
    ModelCard("llama-4-scout",       "LLaMA 4 Scout 17B 16E",      "Meta", "LLaMA", "@cf/meta/llama-4-scout-17b-16e-instruct"),
    ModelCard("llama-3.2-11b-vision","LLaMA 3.2 11B Vision",       "Meta", "LLaMA", "@cf/meta/llama-3.2-11b-vision-instruct"),
    ModelCard("llama-3.1-8b",        "LLaMA 3.1 8B Fast",          "Meta", "LLaMA", "@cf/meta/llama-3.1-8b-instruct-fast"),
    ModelCard("llama-3.1-8b-fp8",    "LLaMA 3.1 8B FP8",           "Meta", "LLaMA", "@cf/meta/llama-3.1-8b-instruct-fp8"),
    ModelCard("llama-3.1-8b-awq",    "LLaMA 3.1 8B AWQ",           "Meta", "LLaMA", "@cf/meta/llama-3.1-8b-instruct-awq"),
    ModelCard("llama-3.2-3b",        "LLaMA 3.2 3B",               "Meta", "LLaMA", "@cf/meta/llama-3.2-3b-instruct"),
    ModelCard("llama-3.2-1b",        "LLaMA 3.2 1B",               "Meta", "LLaMA", "@cf/meta/llama-3.2-1b-instruct"),
    ModelCard("llama-3-8b",          "LLaMA 3 8B",                  "Meta", "LLaMA", "@cf/meta/llama-3-8b-instruct"),
    ModelCard("llama-3-8b-awq",      "LLaMA 3 8B AWQ",             "Meta", "LLaMA", "@cf/meta/llama-3-8b-instruct-awq"),
    ModelCard("llama-guard-3",       "LLaMA Guard 3 8B",            "Meta", "LLaMA", "@cf/meta/llama-guard-3-8b"),
    ModelCard("llama-2-7b-fp16",     "LLaMA 2 7B FP16",            "Meta", "LLaMA", "@cf/meta/llama-2-7b-chat-fp16"),
    ModelCard("llama-2-7b-int8",     "LLaMA 2 7B INT8",            "Meta", "LLaMA", "@cf/meta/llama-2-7b-chat-int8"),
    ModelCard("llama-2-7b-lora",     "LLaMA 2 7B LoRA",            "Meta", "LLaMA", "@cf/meta-llama/llama-2-7b-chat-hf-lora"),
    ModelCard("llama-2-13b",         "LLaMA 2 13B AWQ",            "Meta", "LLaMA", "@hf/thebloke/llama-2-13b-chat-awq"),

    # ─── Qwen ─────────────────────────────────────────────
    ModelCard("qwq-32b",          "QwQ 32B",                "Qwen", "Qwen", "@cf/qwen/qwq-32b"),
    ModelCard("qwen-coder-32b",   "Qwen 2.5 Coder 32B",    "Qwen", "Qwen", "@cf/qwen/qwen2.5-coder-32b-instruct"),
    ModelCard("qwen3-30b",        "Qwen 3 30B A3B FP8",    "Qwen", "Qwen", "@cf/qwen/qwen3-30b-a3b-fp8"),
    ModelCard("qwen1.5-14b",      "Qwen 1.5 14B AWQ",      "Qwen", "Qwen", "@cf/qwen/qwen1.5-14b-chat-awq"),
    ModelCard("qwen1.5-7b",       "Qwen 1.5 7B AWQ",       "Qwen", "Qwen", "@cf/qwen/qwen1.5-7b-chat-awq"),
    ModelCard("qwen1.5-1.8b",     "Qwen 1.5 1.8B",         "Qwen", "Qwen", "@cf/qwen/qwen1.5-1.8b-chat"),
    ModelCard("qwen1.5-0.5b",     "Qwen 1.5 0.5B",         "Qwen", "Qwen", "@cf/qwen/qwen1.5-0.5b-chat"),

    # ─── DeepSeek ─────────────────────────────────────────
    ModelCard("deepseek-r1",          "DeepSeek R1 Distill Qwen 32B",   "DeepSeek", "DeepSeek", "@cf/deepseek-ai/deepseek-r1-distill-qwen-32b"),
    ModelCard("deepseek-math",        "DeepSeek Math 7B",               "DeepSeek", "DeepSeek", "@cf/deepseek-ai/deepseek-math-7b-instruct"),
    ModelCard("deepseek-coder-base",  "DeepSeek Coder 6.7B Base",       "DeepSeek", "DeepSeek", "@hf/thebloke/deepseek-coder-6.7b-base-awq"),
    ModelCard("deepseek-coder",       "DeepSeek Coder 6.7B Instruct",   "DeepSeek", "DeepSeek", "@hf/thebloke/deepseek-coder-6.7b-instruct-awq"),

    # ─── Google Gemma ─────────────────────────────────────
    ModelCard("gemma-3-12b",    "Gemma 3 12B IT",       "Google", "Gemma", "@cf/google/gemma-3-12b-it"),
    ModelCard("gemma-7b",       "Gemma 7B IT",          "Google", "Gemma", "@hf/google/gemma-7b-it"),
    ModelCard("gemma-2b-lora",  "Gemma 2B IT LoRA",     "Google", "Gemma", "@cf/google/gemma-2b-it-lora"),
    ModelCard("gemma-7b-lora",  "Gemma 7B IT LoRA",     "Google", "Gemma", "@cf/google/gemma-7b-it-lora"),

    # ─── Mistral ──────────────────────────────────────────
    ModelCard("mistral-small-3.1", "Mistral Small 3.1 24B",  "Mistral AI", "Mistral", "@cf/mistralai/mistral-small-3.1-24b-instruct"),
    ModelCard("mistral-v0.2",      "Mistral 7B v0.2",        "Mistral AI", "Mistral", "@hf/mistral/mistral-7b-instruct-v0.2"),
    ModelCard("mistral-v0.2-lora", "Mistral 7B v0.2 LoRA",   "Mistral AI", "Mistral", "@cf/mistral/mistral-7b-instruct-v0.2-lora"),
    ModelCard("mistral-v0.1",      "Mistral 7B v0.1",        "Mistral AI", "Mistral", "@cf/mistral/mistral-7b-instruct-v0.1"),
    ModelCard("mistral-v0.1-awq",  "Mistral 7B v0.1 AWQ",    "Mistral AI", "Mistral", "@hf/thebloke/mistral-7b-instruct-v0.1-awq"),

    # ─── IBM Granite ──────────────────────────────────────
    ModelCard("granite-4.0", "Granite 4.0 H Micro", "IBM", "Granite", "@cf/ibm-granite/granite-4.0-h-micro"),

    # ─── ZhipuAI GLM ─────────────────────────────────────
    ModelCard("glm-4.7-flash", "GLM 4.7 Flash", "ZhipuAI", "GLM", "@cf/zai-org/glm-4.7-flash"),

    # ─── AI Singapore ─────────────────────────────────────
    ModelCard("sea-lion-27b", "SEA-LION v4 27B", "AI Singapore", "SEA-LION", "@cf/aisingapore/gemma-sea-lion-v4-27b-it"),

    # ─── Community / Other ────────────────────────────────
    ModelCard("hermes-2-pro",      "Hermes 2 Pro Mistral 7B",    "NousResearch",  "Hermes",      "@hf/nousresearch/hermes-2-pro-mistral-7b"),
    ModelCard("openhermes-2.5",    "OpenHermes 2.5 Mistral 7B",  "NousResearch",  "Hermes",      "@hf/thebloke/openhermes-2.5-mistral-7b-awq"),
    ModelCard("starling-7b",       "Starling LM 7B Beta",        "Nexusflow",     "Starling",    "@hf/nexusflow/starling-lm-7b-beta"),
    ModelCard("neural-chat-7b",    "Neural Chat 7B v3.1",        "Intel",         "Neural Chat", "@hf/thebloke/neural-chat-7b-v3-1-awq"),
    ModelCard("openchat-3.5",      "OpenChat 3.5",               "OpenChat",      "OpenChat",    "@cf/openchat/openchat-3.5-0106"),
    ModelCard("cybertron-7b",      "UNA Cybertron 7B v2",        "fblgit",        "Cybertron",   "@cf/fblgit/una-cybertron-7b-v2-bf16"),
    ModelCard("discolm-german-7b", "DiscoLM German 7B",          "TheBloke",      "DiscoLM",     "@cf/thebloke/discolm-german-7b-v1-awq"),
    ModelCard("zephyr-7b",         "Zephyr 7B Beta",             "HuggingFace",   "Zephyr",      "@hf/thebloke/zephyr-7b-beta-awq"),
    ModelCard("falcon-7b",         "Falcon 7B Instruct",         "TII UAE",       "Falcon",      "@cf/tiiuae/falcon-7b-instruct"),
    ModelCard("tinyllama-1.1b",    "TinyLlama 1.1B Chat",        "TinyLlama",     "TinyLlama",   "@cf/tinyllama/tinyllama-1.1b-chat-v1.0"),
    ModelCard("phi-2",             "Phi 2",                      "Microsoft",     "Phi",         "@cf/microsoft/phi-2"),
    ModelCard("sqlcoder",          "SQLCoder 7B 2",              "Defog",         "SQLCoder",    "@cf/defog/sqlcoder-7b-2"),
)


# ═══════════════════════════════════════════════════════════
# §4 — THINK-TAG STREAM PARSER
#
#  Detects  <think>…</think>  across chunked tokens.
#  Yields  ("reasoning", text)  or  ("content", text).
#  Handles tags split across multiple SSE tokens.
# ═══════════════════════════════════════════════════════════

class ThinkParser:
    __slots__ = ("thinking", "buf")

    OPEN  = "<think>"      # 7 chars
    CLOSE = "</think>"     # 8 chars

    def __init__(self):
        self.thinking = False
        self.buf      = ""

    # ── feed one token, get classified fragments ──────
    def feed(self, token: str) -> list[tuple[str, str]]:
        self.buf += token
        out: list[tuple[str, str]] = []

        while self.buf:
            tag  = self.CLOSE if self.thinking else self.OPEN
            kind = "reasoning" if self.thinking else "content"

            idx = self.buf.find(tag)
            if idx >= 0:
                # full tag found — emit text before, flip state
                if idx > 0:
                    out.append((kind, self.buf[:idx]))
                self.buf = self.buf[idx + len(tag) :]
                self.thinking = not self.thinking
                continue

            # no full tag — check for partial tag stuck at end
            held = self._partial(tag)
            if held:
                safe = self.buf[: -len(held)]
                if safe:
                    out.append((kind, safe))
                self.buf = held
            else:
                out.append((kind, self.buf))
                self.buf = ""
            break

        return out

    # ── drain remaining buffer at stream end ──────────
    def flush(self) -> list[tuple[str, str]]:
        if not self.buf:
            return []
        kind = "reasoning" if self.thinking else "content"
        r = [(kind, self.buf)]
        self.buf = ""
        return r

    # ── helper: longest suffix of buf that is a prefix of tag
    def _partial(self, tag: str) -> str:
        for i in range(min(len(tag) - 1, len(self.buf)), 0, -1):
            if self.buf[-i:] == tag[:i]:
                return self.buf[-i:]
        return ""

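# Illustrative walk-through (comments only, not executed): a tag split across
# tokens is still detected because the partial tag is held back until it can be
# classified.
#
#   p = ThinkParser()
#   p.feed("<thi")           # → []                           (partial tag buffered)
#   p.feed("nk>Let me")      # → [("reasoning", "Let me")]    (tag completed, state flips)
#   p.feed(" plan</think>")  # → [("reasoning", " plan")]     (close tag consumed)
#   p.feed("Answer")         # → [("content", "Answer")]
#   p.flush()                # → []                           (nothing left buffered)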

# ═══════════════════════════════════════════════════════════
# §5 — BACKEND CLIENT
#
#  Talks to the Cloudflare AI proxy hosted on HF Spaces.
#  Parses upstream SSE and yields raw string tokens.
#  Retries on transient HTTP errors.
# ═══════════════════════════════════════════════════════════

_BACKEND     = "https://adarshu07-ls.hf.space"
_BACKEND_URL = f"{_BACKEND}/v1/chat/completions"

_RETRYABLE = frozenset({429, 500, 502, 503, 504, 520, 521, 522, 523, 524})
_FATAL     = frozenset({400, 401, 403, 404, 405, 422})

_BE_HEADERS = {
    "Accept":          "application/json",
    "Accept-Encoding": "gzip, deflate, br",
    "Content-Type":    "application/json",
    "Origin":          _BACKEND,
    "Referer":         f"{_BACKEND}/docs",
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/131.0.0.0 Safari/537.36"
    ),
}


def _parse_sse(line: str) -> tuple[str, bool]:
    """One SSE data: line  →  (token_text, is_done)."""
    line = line.strip()
    if not line.startswith("data:"):
        return "", False
    payload = line[5:].strip()
    if payload == "[DONE]":
        return "", True
    try:
        obj = json.loads(payload)
        if "error" in obj:
            return "", True
        delta = obj.get("choices", [{}])[0].get("delta", {})
        return delta.get("content", "") or "", False
    except (json.JSONDecodeError, KeyError, IndexError):
        return "", False

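# Example frames (illustrative) and what _parse_sse returns for each:
#
#   _parse_sse('data: {"choices": [{"delta": {"content": "Hi"}}]}')   → ("Hi", False)
#   _parse_sse("data: [DONE]")                                        → ("", True)
#   _parse_sse(": keep-alive comment")                                → ("", False)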

async def backend_stream(
    session:     aiohttp.ClientSession,
    messages:    list[dict],
    model_id:    str,
    temperature: float = 0.7,
    max_tokens:  int   = 4096,
    timeout:     int   = 180,
    retries:     int   = 2,
) -> AsyncGenerator[str, None]:
    """POST → upstream, parse SSE, yield raw tokens."""

    body: dict = {
        "model":       model_id,
        "messages":    messages,
        "stream":      True,
        "temperature": temperature,
    }
    if max_tokens:
        body["max_tokens"] = max_tokens

    last_err = ""

    for attempt in range(1 + retries):
        try:
            async with session.post(
                _BACKEND_URL,
                json=body,
                timeout=aiohttp.ClientTimeout(
                    total=timeout,
                    sock_connect=30,
                    sock_read=timeout,
                ),
            ) as resp:

                if resp.status == 200:
                    while True:
                        raw = await resp.content.readline()
                        if not raw:
                            break
                        line = raw.decode("utf-8", errors="replace")
                        if not line.strip():
                            continue
                        tok, done = _parse_sse(line)
                        if done:
                            return
                        if tok:
                            yield tok
                    return

                text = await resp.text()
                last_err = f"HTTP {resp.status}: {text[:300]}"

                if resp.status in _FATAL:
                    raise RuntimeError(last_err)
                if resp.status in _RETRYABLE and attempt < retries:
                    wait = min(2.0 * (attempt + 1) + random.random(), 15)
                    log.warning(f"Retry {attempt+1}/{retries} in {wait:.1f}s — {last_err}")
                    await asyncio.sleep(wait)
                    continue
                raise RuntimeError(last_err)

        except (RuntimeError, GeneratorExit):
            raise
        except (aiohttp.ClientError, asyncio.TimeoutError, OSError) as exc:
            last_err = str(exc)
            if attempt < retries:
                log.warning(f"Retry {attempt+1}/{retries}{last_err}")
                await asyncio.sleep(1.5 * (attempt + 1))
                continue
            raise RuntimeError(f"Backend unreachable: {last_err}") from exc

    raise RuntimeError(f"All retries exhausted: {last_err}")


# ═══════════════════════════════════════════════════════════
# §6 — FASTAPI APP  +  LIFESPAN
# ═══════════════════════════════════════════════════════════

@asynccontextmanager
async def lifespan(app: FastAPI):
    # ── startup ───────────────────────────────────────
    connector = aiohttp.TCPConnector(
        resolver=aiohttp.resolver.ThreadedResolver(),
        limit=100,
        limit_per_host=15,
        ttl_dns_cache=300,
        keepalive_timeout=60,
        enable_cleanup_closed=True,
    )
    app.state.http = aiohttp.ClientSession(
        connector=connector,
        headers=_BE_HEADERS,
    )
    log.info("══════════════════════════════════════════")
    log.info("  ⚡  DevsDo API Server  v1.0.0")
    log.info(f"  Models : {len(Registry.all_cards())}")
    log.info(f"  Backend: {_BACKEND}")
    log.info(f"  Port   : 7860")
    log.info("══════════════════════════════════════════")
    yield
    # ── shutdown ──────────────────────────────────────
    await app.state.http.close()
    log.info("Server stopped ✓")


app = FastAPI(
    title="⚡ DevsDo API",
    description="OpenAI-compatible · 52 Models · Streaming · Reasoning",
    version="1.0.0",
    docs_url="/docs",
    redoc_url="/redoc",
    lifespan=lifespan,
)

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


# ═══════════════════════════════════════════════════════════
# §7 — PYDANTIC SCHEMAS
# ═══════════════════════════════════════════════════════════

class Message(BaseModel):
    role:    str
    content: str

class ChatRequest(BaseModel):
    model:       str            = "kimi-k2.5"
    messages:    list[Message]  = Field(..., min_length=1)
    stream:      bool           = False
    temperature: float          = Field(default=0.7, ge=0.0, le=2.0)
    max_tokens:  Optional[int]  = Field(default=4096, ge=1)


# ═══════════════════════════════════════════════════════════
# §8 — ROUTES
# ═══════════════════════════════════════════════════════════

def _cid() -> str:
    """Generate a chat-completion ID."""
    return f"chatcmpl-{uuid.uuid4().hex[:29]}"

def _sse(obj: Any) -> str:
    """Format one SSE frame."""
    return f"data: {json.dumps(obj, ensure_ascii=False)}\n\n"


# ── info ──────────────────────────────────────────────────

@app.get("/")
async def root():
    return {
        "service": "⚡ DevsDo API",
        "version": "1.0.0",
        "status":  "running",
        "models":  len(Registry.all_cards()),
        "docs":    "/docs",
        "endpoints": {
            "health":         "GET  /health",
            "models_openai":  "GET  /v1/models",
            "models_detail":  "GET  /api/internal/v1/models",
            "chat":           "POST /v1/chat/completions",
        },
    }


@app.get("/health")
async def health():
    return {
        "status":    "healthy",
        "timestamp": int(time.time()),
        "models":    len(Registry.all_cards()),
        "backend":   _BACKEND,
    }


# ── models ────────────────────────────────────────────────

@app.get("/v1/models")
async def models_openai():
    """OpenAI-compatible model list."""
    return Registry.openai_list()


@app.get("/api/internal/v1/models")
async def models_internal():
    """Rich model registry grouped by family."""
    return Registry.internal_list()


# ── chat completions ─────────────────────────────────────

@app.post("/v1/chat/completions")
async def chat_completions(req: ChatRequest):
    """
    OpenAI-compatible chat completions.

    • stream=false  →  JSON  (reasoning in `reasoning_content`)
    • stream=true   →  SSE   (reasoning chunks use `reasoning_content` in delta)
    """
    model_id = Registry.resolve(req.model)
    card     = Registry.find(req.model)
    display  = card.name if card else req.model

    msgs = [{"role": m.role, "content": m.content} for m in req.messages]

    if req.stream:
        return StreamingResponse(
            _stream_gen(app.state.http, msgs, model_id, display,
                        req.temperature, req.max_tokens or 4096),
            media_type="text/event-stream",
            headers={
                "Cache-Control":      "no-cache",
                "Connection":         "keep-alive",
                "X-Accel-Buffering":  "no",
            },
        )

    return await _complete(
        app.state.http, msgs, model_id, display,
        req.temperature, req.max_tokens or 4096,
    )

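# Example call (illustrative; assumes the server is running locally on port 7860):
#
#   curl -N http://localhost:7860/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"model": "deepseek-r1", "stream": true,
#          "messages": [{"role": "user", "content": "Hello"}]}'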

# ═══════════════════════════════════════════════════════════
# §9 — SSE STREAM GENERATOR
#
#  backend tokens  →  ThinkParser  →  OpenAI SSE chunks
#
#  Reasoning tokens go into  delta.reasoning_content
#  Normal   tokens go into  delta.content
# ═══════════════════════════════════════════════════════════

async def _stream_gen(
    session:     aiohttp.ClientSession,
    messages:    list[dict],
    model_id:    str,
    model_name:  str,
    temperature: float,
    max_tokens:  int,
) -> AsyncGenerator[str, None]:

    cid    = _cid()
    ts     = int(time.time())
    parser = ThinkParser()

    def _chunk(delta: dict, finish: Optional[str] = None) -> str:
        return _sse({
            "id":      cid,
            "object":  "chat.completion.chunk",
            "created": ts,
            "model":   model_name,
            "choices": [{
                "index":         0,
                "delta":         delta,
                "finish_reason": finish,
            }],
        })

    # ── role announcement ─────────────────────────────
    yield _chunk({"role": "assistant"})

    try:
        async for token in backend_stream(
            session, messages, model_id, temperature, max_tokens,
        ):
            for kind, text in parser.feed(token):
                if kind == "reasoning":
                    yield _chunk({"reasoning": text})
                else:
                    yield _chunk({"content": text})

        # ── flush parser buffer ───────────────────────
        for kind, text in parser.flush():
            if kind == "reasoning":
                yield _chunk({"reasoning": text})
            else:
                yield _chunk({"content": text})

        # ── stop ──────────────────────────────────────
        yield _chunk({}, finish="stop")
        yield "data: [DONE]\n\n"

    except Exception as exc:
        log.error(f"Stream error [{model_name}]: {exc}")
        yield _chunk({"content": f"\n\n[Error: {exc}]"}, finish="error")
        yield "data: [DONE]\n\n"

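# Illustrative frames as a client sees them (id / created / model fields abridged):
#
#   data: {"object": "chat.completion.chunk", "choices": [{"index": 0,
#          "delta": {"role": "assistant"}, "finish_reason": null}]}
#   data: {"object": "chat.completion.chunk", "choices": [{"index": 0,
#          "delta": {"reasoning_content": "Let me think"}, "finish_reason": null}]}
#   data: {"object": "chat.completion.chunk", "choices": [{"index": 0,
#          "delta": {"content": "Hello!"}, "finish_reason": null}]}
#   data: {"object": "chat.completion.chunk", "choices": [{"index": 0,
#          "delta": {}, "finish_reason": "stop"}]}
#   data: [DONE]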

# ═══════════════════════════════════════════════════════════
# §10 — NON-STREAMING COLLECTOR
# ═══════════════════════════════════════════════════════════

async def _complete(
    session:     aiohttp.ClientSession,
    messages:    list[dict],
    model_id:    str,
    model_name:  str,
    temperature: float,
    max_tokens:  int,
) -> dict:
    """Collect full response, separate reasoning vs content."""

    parser    = ThinkParser()
    reasoning: list[str] = []
    content:   list[str] = []

    try:
        async for token in backend_stream(
            session, messages, model_id, temperature, max_tokens,
        ):
            for kind, text in parser.feed(token):
                (reasoning if kind == "reasoning" else content).append(text)

        for kind, text in parser.flush():
            (reasoning if kind == "reasoning" else content).append(text)

    except Exception as exc:
        raise HTTPException(status_code=502, detail=f"Backend error: {exc}") from exc

    msg: dict = {
        "role":    "assistant",
        "content": "".join(content),
    }
    if reasoning:
        msg["reasoning"] = "".join(reasoning)

    total_chars = len(msg["content"]) + len(msg.get("reasoning_content", ""))

    return {
        "id":      _cid(),
        "object":  "chat.completion",
        "created": int(time.time()),
        "model":   model_name,
        "choices": [{
            "index":         0,
            "message":       msg,
            "finish_reason": "stop",
        }],
        "usage": {
            "prompt_tokens":     0,
            "completion_tokens": total_chars // 4,   # rough estimate
            "total_tokens":      total_chars // 4,
        },
    }


# ═══════════════════════════════════════════════════════════
# §11 — ENTRYPOINT
# ═══════════════════════════════════════════════════════════

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(
        "app:app",
        host="0.0.0.0",
        port=7860,
        workers=1,
        timeout_keep_alive=120,
        log_level="info",
    )
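

# Example client usage (illustrative; assumes the `openai` Python package is
# installed and the server is reachable on localhost:7860; no real key is needed):
#
#   from openai import OpenAI
#   client = OpenAI(base_url="http://localhost:7860/v1", api_key="devsdo")
#   resp = client.chat.completions.create(
#       model="deepseek-r1",
#       messages=[{"role": "user", "content": "Hello"}],
#   )
#   print(resp.choices[0].message.content)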