# Provenance: Hugging Face Space "Sec" — app.py (commit bba37f3, by Adarshu07).
"""
══════════════════════════════════════════════════════════════════
⚡ DevsDo API Server v1.0.0
OpenAI-compatible · 52 Models · Cloudflare AI Backend
SSE Streaming · <think> Reasoning · Zero API Keys
Sections
────────
§1 Logging
§2 Model Registry (g4f-style)
§3 Register All 52 Models
§4 Think-Tag Stream Parser
§5 Backend Client (SSE → raw tokens)
§6 FastAPI App + Lifespan
§7 Pydantic Schemas
§8 Routes
§9 Stream Generator (tokens → OpenAI SSE)
§10 Non-Stream Collector
§11 Entrypoint
══════════════════════════════════════════════════════════════════
"""
from __future__ import annotations
import json, time, uuid, asyncio, random, logging
from contextlib import asynccontextmanager
from dataclasses import dataclass, asdict
from typing import Optional, AsyncGenerator, Dict, List, Any
import aiohttp
import aiohttp.resolver
from fastapi import FastAPI, HTTPException
from fastapi.responses import StreamingResponse
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, Field
# ═══════════════════════════════════════════════════════════
# §1 — LOGGING
# ═══════════════════════════════════════════════════════════
# Box-drawing separators in the format string match the banner style
# printed by lifespan() at startup.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s │ %(levelname)-7s │ %(message)s",
    datefmt="%H:%M:%S",
)
# Module-wide logger used by the backend client and stream generator.
log = logging.getLogger("devsdo")
# ═══════════════════════════════════════════════════════════
# §2 — MODEL REGISTRY (g4f-style)
#
# Each model carries:
# name – short route alias ("deepseek-r1")
# real_name – human display name ("DeepSeek R1 Distill Qwen 32B")
# author – organisation ("DeepSeek")
# family – model family group ("DeepSeek")
# model_id – backend @cf/@hf ID ("@cf/deepseek-ai/…")
# ═══════════════════════════════════════════════════════════
@dataclass(frozen=True, slots=True)
class ModelCard:
    """Immutable descriptor for one chat model exposed by this server."""
    # short route alias, e.g. "deepseek-r1"
    name: str
    # human display name, e.g. "DeepSeek R1 Distill Qwen 32B"
    real_name: str
    # publishing organisation, e.g. "DeepSeek"
    author: str
    # family grouping used by the internal model list
    family: str
    # backend @cf/@hf identifier sent upstream
    model_id: str
class Registry:
    """Central model store — register once, resolve anywhere.

    All state lives on the class itself, so every importer sees the same
    set of cards; there is no instance to construct.
    """
    _by_name: Dict[str, ModelCard] = {}   # alias → card (insertion order drives fuzzy priority)
    _by_id: Dict[str, ModelCard] = {}     # backend model_id → card
    _default: str = ""                    # alias of the first card ever registered

    # ── mutators ──────────────────────────────────────
    @classmethod
    def add(cls, *cards: ModelCard):
        """Register cards; the first card ever added becomes the default."""
        for c in cards:
            cls._by_name[c.name] = c
            cls._by_id[c.model_id] = c
            if not cls._default:
                cls._default = c.name

    # ── lookups ───────────────────────────────────────
    @classmethod
    def resolve(cls, raw: Optional[str]) -> str:
        """Alias / full-id / fuzzy → backend model_id."""
        if not raw:
            # No model requested — fall back to the default (first registered).
            return cls._by_name[cls._default].model_id
        raw = raw.strip()
        # Drop an optional vendor prefix, e.g. "devsdo/kimi-k2.5" → "kimi-k2.5".
        for pfx in ("devsdo/", "devsdo:", "cloudflare/", "cf/"):
            if raw.lower().startswith(pfx):
                raw = raw[len(pfx):]
                break
        # Already a raw backend id — pass through untouched.
        if raw.startswith(("@cf/", "@hf/")):
            return raw
        # Exact alias hit.
        if raw in cls._by_name:
            return cls._by_name[raw].model_id
        # Fuzzy: substring match against aliases and backend ids;
        # the first registered match wins (dicts preserve insertion order).
        low = raw.lower()
        for alias, card in cls._by_name.items():
            if low in alias or low in card.model_id.lower():
                return card.model_id
        return raw  # pass-through: let the backend reject unknown ids

    @classmethod
    def find(cls, raw: str) -> Optional[ModelCard]:
        """Best-effort card lookup via resolve(); None when nothing matches."""
        mid = cls.resolve(raw)
        return cls._by_id.get(mid) or cls._by_name.get(raw)

    @classmethod
    def all_cards(cls) -> List[ModelCard]:
        """Every registered card, in registration order."""
        return list(cls._by_name.values())

    # ── serialisers ───────────────────────────────────
    @classmethod
    def openai_list(cls) -> dict:
        """GET /v1/models — OpenAI-compatible."""
        return {
            "object": "list",
            "data": [
                {
                    "id": c.name,
                    "object": "model",
                    "created": 1700000000,  # fixed placeholder timestamp
                    "owned_by": c.author.lower().replace(" ", "-"),
                }
                for c in cls._by_name.values()
            ],
        }

    @classmethod
    def internal_list(cls) -> dict:
        """GET /api/internal/v1/models — rich, grouped by family."""
        fam: Dict[str, list] = {}
        for c in cls._by_name.values():
            fam.setdefault(c.family, []).append(
                {
                    "id": c.name,
                    "name": c.real_name,
                    "author": c.author,
                    "backend_id": c.model_id,
                }
            )
        return {
            "server": "DevsDo API",
            "version": "1.0.0",
            "timestamp": int(time.time()),
            "total": len(cls._by_name),
            "families": [
                {"family": fn, "count": len(ms), "models": ms}
                for fn, ms in fam.items()
            ],
        }
# ═══════════════════════════════════════════════════════════
# §3 — REGISTER ALL 52 MODELS
# ═══════════════════════════════════════════════════════════
# Single bulk registration. The first entry ("kimi-k2.5") becomes the
# default model used when a request names no model.
Registry.add(
    # ─── Flagship / Large ─────────────────────────────────
    ModelCard("kimi-k2.5", "Kimi K2.5", "Moonshot AI", "Kimi", "@cf/moonshotai/kimi-k2.5"),
    ModelCard("nemotron-120b", "Nemotron 3 120B A12B", "NVIDIA", "Nemotron", "@cf/nvidia/nemotron-3-120b-a12b"),
    ModelCard("gpt-oss-120b", "GPT-OSS 120B", "OpenAI", "GPT-OSS", "@cf/openai/gpt-oss-120b"),
    ModelCard("gpt-oss-20b", "GPT-OSS 20B", "OpenAI", "GPT-OSS", "@cf/openai/gpt-oss-20b"),
    ModelCard("llama-3.3-70b", "LLaMA 3.3 70B Instruct FP8", "Meta", "LLaMA", "@cf/meta/llama-3.3-70b-instruct-fp8-fast"),
    # ─── Meta LLaMA ───────────────────────────────────────
    ModelCard("llama-4-scout", "LLaMA 4 Scout 17B 16E", "Meta", "LLaMA", "@cf/meta/llama-4-scout-17b-16e-instruct"),
    ModelCard("llama-3.2-11b-vision", "LLaMA 3.2 11B Vision", "Meta", "LLaMA", "@cf/meta/llama-3.2-11b-vision-instruct"),
    ModelCard("llama-3.1-8b", "LLaMA 3.1 8B Fast", "Meta", "LLaMA", "@cf/meta/llama-3.1-8b-instruct-fast"),
    ModelCard("llama-3.1-8b-fp8", "LLaMA 3.1 8B FP8", "Meta", "LLaMA", "@cf/meta/llama-3.1-8b-instruct-fp8"),
    ModelCard("llama-3.1-8b-awq", "LLaMA 3.1 8B AWQ", "Meta", "LLaMA", "@cf/meta/llama-3.1-8b-instruct-awq"),
    ModelCard("llama-3.2-3b", "LLaMA 3.2 3B", "Meta", "LLaMA", "@cf/meta/llama-3.2-3b-instruct"),
    ModelCard("llama-3.2-1b", "LLaMA 3.2 1B", "Meta", "LLaMA", "@cf/meta/llama-3.2-1b-instruct"),
    ModelCard("llama-3-8b", "LLaMA 3 8B", "Meta", "LLaMA", "@cf/meta/llama-3-8b-instruct"),
    ModelCard("llama-3-8b-awq", "LLaMA 3 8B AWQ", "Meta", "LLaMA", "@cf/meta/llama-3-8b-instruct-awq"),
    ModelCard("llama-guard-3", "LLaMA Guard 3 8B", "Meta", "LLaMA", "@cf/meta/llama-guard-3-8b"),
    ModelCard("llama-2-7b-fp16", "LLaMA 2 7B FP16", "Meta", "LLaMA", "@cf/meta/llama-2-7b-chat-fp16"),
    ModelCard("llama-2-7b-int8", "LLaMA 2 7B INT8", "Meta", "LLaMA", "@cf/meta/llama-2-7b-chat-int8"),
    ModelCard("llama-2-7b-lora", "LLaMA 2 7B LoRA", "Meta", "LLaMA", "@cf/meta-llama/llama-2-7b-chat-hf-lora"),
    ModelCard("llama-2-13b", "LLaMA 2 13B AWQ", "Meta", "LLaMA", "@hf/thebloke/llama-2-13b-chat-awq"),
    # ─── Qwen ─────────────────────────────────────────────
    ModelCard("qwq-32b", "QwQ 32B", "Qwen", "Qwen", "@cf/qwen/qwq-32b"),
    ModelCard("qwen-coder-32b", "Qwen 2.5 Coder 32B", "Qwen", "Qwen", "@cf/qwen/qwen2.5-coder-32b-instruct"),
    ModelCard("qwen3-30b", "Qwen 3 30B A3B FP8", "Qwen", "Qwen", "@cf/qwen/qwen3-30b-a3b-fp8"),
    ModelCard("qwen1.5-14b", "Qwen 1.5 14B AWQ", "Qwen", "Qwen", "@cf/qwen/qwen1.5-14b-chat-awq"),
    ModelCard("qwen1.5-7b", "Qwen 1.5 7B AWQ", "Qwen", "Qwen", "@cf/qwen/qwen1.5-7b-chat-awq"),
    ModelCard("qwen1.5-1.8b", "Qwen 1.5 1.8B", "Qwen", "Qwen", "@cf/qwen/qwen1.5-1.8b-chat"),
    ModelCard("qwen1.5-0.5b", "Qwen 1.5 0.5B", "Qwen", "Qwen", "@cf/qwen/qwen1.5-0.5b-chat"),
    # ─── DeepSeek ─────────────────────────────────────────
    ModelCard("deepseek-r1", "DeepSeek R1 Distill Qwen 32B", "DeepSeek", "DeepSeek", "@cf/deepseek-ai/deepseek-r1-distill-qwen-32b"),
    ModelCard("deepseek-math", "DeepSeek Math 7B", "DeepSeek", "DeepSeek", "@cf/deepseek-ai/deepseek-math-7b-instruct"),
    ModelCard("deepseek-coder-base", "DeepSeek Coder 6.7B Base", "DeepSeek", "DeepSeek", "@hf/thebloke/deepseek-coder-6.7b-base-awq"),
    ModelCard("deepseek-coder", "DeepSeek Coder 6.7B Instruct", "DeepSeek", "DeepSeek", "@hf/thebloke/deepseek-coder-6.7b-instruct-awq"),
    # ─── Google Gemma ─────────────────────────────────────
    ModelCard("gemma-3-12b", "Gemma 3 12B IT", "Google", "Gemma", "@cf/google/gemma-3-12b-it"),
    ModelCard("gemma-7b", "Gemma 7B IT", "Google", "Gemma", "@hf/google/gemma-7b-it"),
    ModelCard("gemma-2b-lora", "Gemma 2B IT LoRA", "Google", "Gemma", "@cf/google/gemma-2b-it-lora"),
    ModelCard("gemma-7b-lora", "Gemma 7B IT LoRA", "Google", "Gemma", "@cf/google/gemma-7b-it-lora"),
    # ─── Mistral ──────────────────────────────────────────
    ModelCard("mistral-small-3.1", "Mistral Small 3.1 24B", "Mistral AI", "Mistral", "@cf/mistralai/mistral-small-3.1-24b-instruct"),
    ModelCard("mistral-v0.2", "Mistral 7B v0.2", "Mistral AI", "Mistral", "@hf/mistral/mistral-7b-instruct-v0.2"),
    ModelCard("mistral-v0.2-lora", "Mistral 7B v0.2 LoRA", "Mistral AI", "Mistral", "@cf/mistral/mistral-7b-instruct-v0.2-lora"),
    ModelCard("mistral-v0.1", "Mistral 7B v0.1", "Mistral AI", "Mistral", "@cf/mistral/mistral-7b-instruct-v0.1"),
    ModelCard("mistral-v0.1-awq", "Mistral 7B v0.1 AWQ", "Mistral AI", "Mistral", "@hf/thebloke/mistral-7b-instruct-v0.1-awq"),
    # ─── IBM Granite ──────────────────────────────────────
    ModelCard("granite-4.0", "Granite 4.0 H Micro", "IBM", "Granite", "@cf/ibm-granite/granite-4.0-h-micro"),
    # ─── ZhipuAI GLM ─────────────────────────────────────
    ModelCard("glm-4.7-flash", "GLM 4.7 Flash", "ZhipuAI", "GLM", "@cf/zai-org/glm-4.7-flash"),
    # ─── AI Singapore ─────────────────────────────────────
    ModelCard("sea-lion-27b", "SEA-LION v4 27B", "AI Singapore", "SEA-LION", "@cf/aisingapore/gemma-sea-lion-v4-27b-it"),
    # ─── Community / Other ────────────────────────────────
    ModelCard("hermes-2-pro", "Hermes 2 Pro Mistral 7B", "NousResearch", "Hermes", "@hf/nousresearch/hermes-2-pro-mistral-7b"),
    ModelCard("openhermes-2.5", "OpenHermes 2.5 Mistral 7B", "NousResearch", "Hermes", "@hf/thebloke/openhermes-2.5-mistral-7b-awq"),
    ModelCard("starling-7b", "Starling LM 7B Beta", "Nexusflow", "Starling", "@hf/nexusflow/starling-lm-7b-beta"),
    ModelCard("neural-chat-7b", "Neural Chat 7B v3.1", "Intel", "Neural Chat", "@hf/thebloke/neural-chat-7b-v3-1-awq"),
    ModelCard("openchat-3.5", "OpenChat 3.5", "OpenChat", "OpenChat", "@cf/openchat/openchat-3.5-0106"),
    ModelCard("cybertron-7b", "UNA Cybertron 7B v2", "fblgit", "Cybertron", "@cf/fblgit/una-cybertron-7b-v2-bf16"),
    ModelCard("discolm-german-7b", "DiscoLM German 7B", "TheBloke", "DiscoLM", "@cf/thebloke/discolm-german-7b-v1-awq"),
    ModelCard("zephyr-7b", "Zephyr 7B Beta", "HuggingFace", "Zephyr", "@hf/thebloke/zephyr-7b-beta-awq"),
    ModelCard("falcon-7b", "Falcon 7B Instruct", "TII UAE", "Falcon", "@cf/tiiuae/falcon-7b-instruct"),
    ModelCard("tinyllama-1.1b", "TinyLlama 1.1B Chat", "TinyLlama", "TinyLlama", "@cf/tinyllama/tinyllama-1.1b-chat-v1.0"),
    ModelCard("phi-2", "Phi 2", "Microsoft", "Phi", "@cf/microsoft/phi-2"),
    ModelCard("sqlcoder", "SQLCoder 7B 2", "Defog", "SQLCoder", "@cf/defog/sqlcoder-7b-2"),
)
# ═══════════════════════════════════════════════════════════
# §4 — THINK-TAG STREAM PARSER
#
# Detects <think>…</think> across chunked tokens.
# Yields ("reasoning", text) or ("content", text).
# Handles tags split across multiple SSE tokens.
# ═══════════════════════════════════════════════════════════
class ThinkParser:
    """Incremental splitter for <think>…</think> spans in a token stream.

    feed() accepts arbitrarily-chunked tokens and classifies text as
    ("reasoning", …) while inside a think span, ("content", …) outside.
    A tag split across chunk boundaries is held back until it can be
    resolved as either a full tag or plain text.
    """
    __slots__ = ("thinking", "buf")
    OPEN = "<think>"    # 7 chars
    CLOSE = "</think>"  # 8 chars

    def __init__(self):
        self.thinking = False  # currently inside a <think> span?
        self.buf = ""          # unclassified text (may end in a partial tag)

    def feed(self, token: str) -> list[tuple[str, str]]:
        """Consume one token; return classified (kind, text) fragments."""
        self.buf += token
        pieces: list[tuple[str, str]] = []
        while self.buf:
            marker = self.CLOSE if self.thinking else self.OPEN
            label = "reasoning" if self.thinking else "content"
            pos = self.buf.find(marker)
            if pos < 0:
                # No complete tag — hold back any trailing partial tag.
                tail = self._partial(marker)
                emit = self.buf[: len(self.buf) - len(tail)] if tail else self.buf
                if emit:
                    pieces.append((label, emit))
                self.buf = tail
                break
            # Complete tag: emit preceding text, drop the tag, flip state.
            if pos:
                pieces.append((label, self.buf[:pos]))
            self.buf = self.buf[pos + len(marker):]
            self.thinking = not self.thinking
        return pieces

    def flush(self) -> list[tuple[str, str]]:
        """Emit whatever is still buffered at end of stream."""
        if not self.buf:
            return []
        label = "reasoning" if self.thinking else "content"
        leftover, self.buf = self.buf, ""
        return [(label, leftover)]

    def _partial(self, tag: str) -> str:
        """Longest suffix of buf that is a proper prefix of tag ('' if none)."""
        for n in range(min(len(tag) - 1, len(self.buf)), 0, -1):
            if self.buf.endswith(tag[:n]):
                return tag[:n]
        return ""
# ═══════════════════════════════════════════════════════════
# §5 — BACKEND CLIENT
#
# Talks to the Cloudflare AI proxy hosted on HF Spaces.
# Parses upstream SSE and yields raw string tokens.
# Retries on transient HTTP errors.
# ═══════════════════════════════════════════════════════════
# Upstream Cloudflare-AI proxy (a Hugging Face Space) and its
# OpenAI-style chat endpoint.
_BACKEND = "https://adarshu07-ls.hf.space"
_BACKEND_URL = f"{_BACKEND}/v1/chat/completions"
# Transient statuses worth retrying (incl. the Cloudflare 52x range).
_RETRYABLE = frozenset({429, 500, 502, 503, 504, 520, 521, 522, 523, 524})
# Request-side errors — retrying cannot help, fail immediately.
_FATAL = frozenset({400, 401, 403, 404, 405, 422})
# Browser-like headers sent on every upstream request.
_BE_HEADERS = {
    "Accept": "application/json",
    "Accept-Encoding": "gzip, deflate, br",
    "Content-Type": "application/json",
    "Origin": _BACKEND,
    "Referer": f"{_BACKEND}/docs",
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/131.0.0.0 Safari/537.36"
    ),
}
def _parse_sse(line: str) -> tuple[str, bool]:
"""One SSE data: line → (token_text, is_done)."""
line = line.strip()
if not line.startswith("data:"):
return "", False
payload = line[5:].strip()
if payload == "[DONE]":
return "", True
try:
obj = json.loads(payload)
if "error" in obj:
return "", True
delta = obj.get("choices", [{}])[0].get("delta", {})
return delta.get("content", "") or "", False
except (json.JSONDecodeError, KeyError, IndexError):
return "", False
async def backend_stream(
    session: aiohttp.ClientSession,
    messages: list[dict],
    model_id: str,
    temperature: float = 0.7,
    max_tokens: int = 4096,
    timeout: int = 180,
    retries: int = 2,
) -> AsyncGenerator[str, None]:
    """POST to the upstream proxy, parse its SSE stream, yield raw tokens.

    Transient HTTP statuses (_RETRYABLE) and network errors are retried
    with backoff — but never after the first token has been yielded,
    because replaying the request from scratch would duplicate output
    already sent downstream (defect in the previous version).

    Args:
        session: shared aiohttp session (from app.state.http).
        messages: OpenAI-style message dicts.
        model_id: backend @cf/@hf model identifier.
    Raises:
        RuntimeError: fatal upstream status, exhausted retries, or a
            mid-stream failure after output has started.
    """
    body: dict = {
        "model": model_id,
        "messages": messages,
        "stream": True,
        "temperature": temperature,
    }
    if max_tokens:
        body["max_tokens"] = max_tokens
    last_err = ""
    started = False  # becomes True once any token has been yielded
    for attempt in range(1 + retries):
        try:
            async with session.post(
                _BACKEND_URL,
                json=body,
                timeout=aiohttp.ClientTimeout(
                    total=timeout,
                    sock_connect=30,
                    sock_read=timeout,
                ),
            ) as resp:
                if resp.status == 200:
                    while True:
                        raw = await resp.content.readline()
                        if not raw:
                            break  # EOF without [DONE] — treat as normal end
                        line = raw.decode("utf-8", errors="replace")
                        if not line.strip():
                            continue  # SSE keep-alive / frame separator
                        tok, done = _parse_sse(line)
                        if done:
                            return
                        if tok:
                            started = True
                            yield tok
                    return
                text = await resp.text()
                last_err = f"HTTP {resp.status}: {text[:300]}"
                if resp.status in _FATAL:
                    raise RuntimeError(last_err)
                if resp.status in _RETRYABLE and attempt < retries:
                    # Linear backoff with jitter, capped at 15s.
                    wait = min(2.0 * (attempt + 1) + random.random(), 15)
                    log.warning(f"Retry {attempt+1}/{retries} in {wait:.1f}s — {last_err}")
                    await asyncio.sleep(wait)
                    continue
                raise RuntimeError(last_err)
        except (RuntimeError, GeneratorExit):
            raise
        except (aiohttp.ClientError, asyncio.TimeoutError, OSError) as exc:
            last_err = str(exc)
            if started:
                # Tokens already flowed downstream; retrying would replay
                # the whole response and duplicate them — fail instead.
                raise RuntimeError(f"Stream interrupted: {last_err}") from exc
            if attempt < retries:
                log.warning(f"Retry {attempt+1}/{retries} — {last_err}")
                await asyncio.sleep(1.5 * (attempt + 1))
                continue
            raise RuntimeError(f"Backend unreachable: {last_err}") from exc
    raise RuntimeError(f"All retries exhausted: {last_err}")
# ═══════════════════════════════════════════════════════════
# §6 — FASTAPI APP + LIFESPAN
# ═══════════════════════════════════════════════════════════
@asynccontextmanager
async def lifespan(app: FastAPI):
    """App lifespan: build one shared aiohttp session, close it on shutdown."""
    # ── startup ───────────────────────────────────────
    connector = aiohttp.TCPConnector(
        # Thread-based DNS resolution (avoids the optional aiodns dependency).
        resolver=aiohttp.resolver.ThreadedResolver(),
        limit=100,           # total pooled connections
        limit_per_host=15,   # cap towards the single upstream host
        ttl_dns_cache=300,
        keepalive_timeout=60,
        enable_cleanup_closed=True,
    )
    # One session reused by every request handler, exposed via app.state.
    app.state.http = aiohttp.ClientSession(
        connector=connector,
        headers=_BE_HEADERS,
    )
    log.info("══════════════════════════════════════════")
    log.info(" ⚡ DevsDo API Server v1.0.0")
    log.info(f" Models : {len(Registry.all_cards())}")
    log.info(f" Backend: {_BACKEND}")
    log.info(f" Port : 7860")
    log.info("══════════════════════════════════════════")
    yield
    # ── shutdown ──────────────────────────────────────
    await app.state.http.close()
    log.info("Server stopped ✓")
# FastAPI application — interactive docs served at /docs and /redoc.
app = FastAPI(
    title="⚡ DevsDo API",
    description="OpenAI-compatible · 52 Models · Streaming · Reasoning",
    version="1.0.0",
    docs_url="/docs",
    redoc_url="/redoc",
    lifespan=lifespan,
)
# Fully-open CORS for a public, key-less API.
# NOTE(review): allow_credentials=True combined with allow_origins=["*"]
# is disallowed by the CORS spec for credentialed browser requests —
# confirm whether credentials are actually needed here.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
# ═══════════════════════════════════════════════════════════
# §7 — PYDANTIC SCHEMAS
# ═══════════════════════════════════════════════════════════
class Message(BaseModel):
    """One chat turn in OpenAI request format."""
    role: str     # e.g. "system" / "user" / "assistant" — not validated here
    content: str  # plain-text message body
class ChatRequest(BaseModel):
    """Request body for POST /v1/chat/completions (OpenAI-compatible subset)."""
    model: str = "kimi-k2.5"  # alias, backend id, or fuzzy name (see Registry.resolve)
    messages: list[Message] = Field(..., min_length=1)  # at least one turn required
    stream: bool = False      # SSE stream when True, single JSON body otherwise
    temperature: float = Field(default=0.7, ge=0.0, le=2.0)
    max_tokens: Optional[int] = Field(default=4096, ge=1)
# ═══════════════════════════════════════════════════════════
# §8 — ROUTES
# ═══════════════════════════════════════════════════════════
def _cid() -> str:
"""Generate a chat-completion ID."""
return f"chatcmpl-{uuid.uuid4().hex[:29]}"
def _sse(obj: Any) -> str:
"""Format one SSE frame."""
return f"data: {json.dumps(obj, ensure_ascii=False)}\n\n"
# ── info ──────────────────────────────────────────────────
@app.get("/")
async def root():
return {
"service": "⚡ DevsDo API",
"version": "1.0.0",
"status": "running",
"models": len(Registry.all_cards()),
"docs": "/docs",
"endpoints": {
"health": "GET /health",
"models_openai": "GET /v1/models",
"models_detail": "GET /api/internal/v1/models",
"chat": "POST /v1/chat/completions",
},
}
@app.get("/health")
async def health():
return {
"status": "healthy",
"timestamp": int(time.time()),
"models": len(Registry.all_cards()),
"backend": _BACKEND,
}
# ── models ────────────────────────────────────────────────
@app.get("/v1/models")
async def models_openai():
"""OpenAI-compatible model list."""
return Registry.openai_list()
@app.get("/api/internal/v1/models")
async def models_internal():
"""Rich model registry grouped by family."""
return Registry.internal_list()
# ── chat completions ─────────────────────────────────────
@app.post("/v1/chat/completions")
async def chat_completions(req: ChatRequest):
"""
OpenAI-compatible chat completions.
• stream=false → JSON (reasoning in `reasoning_content`)
• stream=true → SSE (reasoning chunks use `reasoning_content` in delta)
"""
model_id = Registry.resolve(req.model)
card = Registry.find(req.model)
display = card.name if card else req.model
msgs = [{"role": m.role, "content": m.content} for m in req.messages]
if req.stream:
return StreamingResponse(
_stream_gen(app.state.http, msgs, model_id, display,
req.temperature, req.max_tokens or 4096),
media_type="text/event-stream",
headers={
"Cache-Control": "no-cache",
"Connection": "keep-alive",
"X-Accel-Buffering": "no",
},
)
return await _complete(
app.state.http, msgs, model_id, display,
req.temperature, req.max_tokens or 4096,
)
# ═══════════════════════════════════════════════════════════
# §9 — SSE STREAM GENERATOR
#
# backend tokens → ThinkParser → OpenAI SSE chunks
#
# Reasoning tokens go into delta.reasoning_content
# Normal tokens go into delta.content
# ═══════════════════════════════════════════════════════════
async def _stream_gen(
    session: aiohttp.ClientSession,
    messages: list[dict],
    model_id: str,
    model_name: str,
    temperature: float,
    max_tokens: int,
) -> AsyncGenerator[str, None]:
    """Backend tokens → ThinkParser → OpenAI-style SSE chunk frames.

    Reasoning text is emitted as ``delta.reasoning_content`` — matching
    both the §9 header comment and the /v1/chat/completions docstring
    (the previous code emitted a nonstandard ``reasoning`` key) — while
    normal text goes in ``delta.content``.  The stream always terminates
    with a finish chunk and a ``[DONE]`` sentinel, even on error.
    """
    cid = _cid()
    ts = int(time.time())
    parser = ThinkParser()

    def _chunk(delta: dict, finish: Optional[str] = None) -> str:
        # One chat.completion.chunk frame with a single choice.
        return _sse({
            "id": cid,
            "object": "chat.completion.chunk",
            "created": ts,
            "model": model_name,
            "choices": [{
                "index": 0,
                "delta": delta,
                "finish_reason": finish,
            }],
        })

    def _delta(kind: str, text: str) -> dict:
        # Map a parser fragment kind to the OpenAI-compatible delta field.
        key = "reasoning_content" if kind == "reasoning" else "content"
        return {key: text}

    # Role announcement first, per the OpenAI streaming format.
    yield _chunk({"role": "assistant"})
    try:
        async for token in backend_stream(
            session, messages, model_id, temperature, max_tokens,
        ):
            for kind, text in parser.feed(token):
                yield _chunk(_delta(kind, text))
        # Drain anything still buffered (e.g. a held partial tag).
        for kind, text in parser.flush():
            yield _chunk(_delta(kind, text))
        # Normal termination.
        yield _chunk({}, finish="stop")
        yield "data: [DONE]\n\n"
    except Exception as exc:
        # Surface the failure in-band so SSE clients see it, then end cleanly.
        log.error(f"Stream error [{model_name}]: {exc}")
        yield _chunk({"content": f"\n\n[Error: {exc}]"}, finish="error")
        yield "data: [DONE]\n\n"
# ═══════════════════════════════════════════════════════════
# §10 — NON-STREAMING COLLECTOR
# ═══════════════════════════════════════════════════════════
async def _complete(
    session: aiohttp.ClientSession,
    messages: list[dict],
    model_id: str,
    model_name: str,
    temperature: float,
    max_tokens: int,
) -> dict:
    """Collect the full backend stream into one chat.completion response.

    Reasoning text (inside <think> tags) is returned in the message's
    ``reasoning_content`` field — matching the documented contract of
    /v1/chat/completions (the previous code used a nonstandard
    ``reasoning`` key); regular text goes in ``content``.

    Raises:
        HTTPException: 502 when the backend stream fails.
    """
    parser = ThinkParser()
    reasoning: list[str] = []
    content: list[str] = []
    try:
        async for token in backend_stream(
            session, messages, model_id, temperature, max_tokens,
        ):
            for kind, text in parser.feed(token):
                (reasoning if kind == "reasoning" else content).append(text)
        for kind, text in parser.flush():
            (reasoning if kind == "reasoning" else content).append(text)
    except Exception as exc:
        # Chain the cause so the original backend error stays in tracebacks.
        raise HTTPException(status_code=502, detail=f"Backend error: {exc}") from exc
    msg: dict = {
        "role": "assistant",
        "content": "".join(content),
    }
    if reasoning:
        msg["reasoning_content"] = "".join(reasoning)
    total_chars = len(msg["content"]) + len(msg.get("reasoning_content", ""))
    return {
        "id": _cid(),
        "object": "chat.completion",
        "created": int(time.time()),
        "model": model_name,
        "choices": [{
            "index": 0,
            "message": msg,
            "finish_reason": "stop",
        }],
        "usage": {
            # No upstream usage data — crude chars/4 token estimate.
            "prompt_tokens": 0,
            "completion_tokens": total_chars // 4,
            "total_tokens": total_chars // 4,
        },
    }
# ═══════════════════════════════════════════════════════════
# §11 — ENTRYPOINT
# ═══════════════════════════════════════════════════════════
if __name__ == "__main__":
    # Imported lazily: uvicorn is only needed when run as a script.
    import uvicorn
    uvicorn.run(
        "app:app",  # import string (module:attr) rather than the app object
        host="0.0.0.0",
        port=7860,  # matches the port announced in the startup banner
        workers=1,
        timeout_keep_alive=120,
        log_level="info",
    )