| """ |
| ══════════════════════════════════════════════════════════════════ |
| ⚡ DevsDo API Server v1.0.0 |
| |
| OpenAI-compatible · 52 Models · Cloudflare AI Backend |
| SSE Streaming · <think> Reasoning · Zero API Keys |
| |
| Sections |
| ──────── |
| §1 Logging |
| §2 Model Registry (g4f-style) |
| §3 Register All 52 Models |
| §4 Think-Tag Stream Parser |
| §5 Backend Client (SSE → raw tokens) |
| §6 FastAPI App + Lifespan |
| §7 Pydantic Schemas |
| §8 Routes |
| §9 Stream Generator (tokens → OpenAI SSE) |
| §10 Non-Stream Collector |
| §11 Entrypoint |
| ══════════════════════════════════════════════════════════════════ |
| """ |
|
|
| from __future__ import annotations |
|
|
| import json, time, uuid, asyncio, random, logging |
| from contextlib import asynccontextmanager |
| from dataclasses import dataclass, asdict |
| from typing import Optional, AsyncGenerator, Dict, List, Any |
|
|
| import aiohttp |
| import aiohttp.resolver |
| from fastapi import FastAPI, HTTPException |
| from fastapi.responses import StreamingResponse |
| from fastapi.middleware.cors import CORSMiddleware |
| from pydantic import BaseModel, Field |
|
|
|
|
| |
| |
| |
|
|
# Root logging config: timestamped, column-aligned level names, HH:MM:SS only.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s │ %(levelname)-7s │ %(message)s",
    datefmt="%H:%M:%S",
)
# Module-wide logger used by the backend client and route handlers below.
log = logging.getLogger("devsdo")
|
|
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
@dataclass(frozen=True, slots=True)
class ModelCard:
    """Immutable metadata for one served model (public alias → backend id)."""

    name: str       # public alias exposed to API clients (e.g. "kimi-k2.5")
    real_name: str  # human-readable display name
    author: str     # vendor / organization
    family: str     # grouping key used by /api/internal/v1/models
    model_id: str   # upstream backend identifier (e.g. "@cf/..." or "@hf/...")
|
|
|
|
class Registry:
    """Central model store — register once, resolve anywhere.

    All state lives on the class itself: the registry is a process-wide
    singleton populated once at import time via ``Registry.add``.
    """

    _by_name: Dict[str, ModelCard] = {}  # public alias → card (insertion order)
    _by_id: Dict[str, ModelCard] = {}    # backend model_id → card
    _default: str = ""                   # alias of the first card ever added

    @classmethod
    def add(cls, *cards: ModelCard) -> None:
        """Register cards; the very first card registered becomes the default."""
        for c in cards:
            cls._by_name[c.name] = c
            cls._by_id[c.model_id] = c
            if not cls._default:
                cls._default = c.name

    @classmethod
    def resolve(cls, raw: Optional[str]) -> str:
        """Alias / full-id / fuzzy → backend model_id.

        Resolution order: default (empty input) → strip one vendor prefix →
        pass raw backend ids through → exact alias match → first substring
        match in registration order → echo input unchanged.
        """
        if not raw:
            return cls._by_name[cls._default].model_id
        raw = raw.strip()
        # Strip at most one optional vendor prefix (case-insensitive match,
        # but the remainder keeps its original case).
        for pfx in ("devsdo/", "devsdo:", "cloudflare/", "cf/"):
            if raw.lower().startswith(pfx):
                raw = raw[len(pfx):]
                break
        # Already a raw backend id — pass through untouched.
        if raw.startswith(("@cf/", "@hf/")):
            return raw
        # Exact alias match.
        if raw in cls._by_name:
            return cls._by_name[raw].model_id
        # Fuzzy: first alias or backend id containing the input as a
        # substring wins; registration order decides ties.
        low = raw.lower()
        for alias, card in cls._by_name.items():
            if low in alias or low in card.model_id.lower():
                return card.model_id
        # Unknown — hand the string to the backend and let it reject it.
        return raw

    @classmethod
    def find(cls, raw: str) -> Optional[ModelCard]:
        """Best-effort card lookup; None when nothing matches."""
        mid = cls.resolve(raw)
        return cls._by_id.get(mid) or cls._by_name.get(raw)

    @classmethod
    def all_cards(cls) -> List[ModelCard]:
        """Every registered card, in registration order."""
        return list(cls._by_name.values())

    @classmethod
    def openai_list(cls) -> dict:
        """GET /v1/models — OpenAI-compatible listing."""
        return {
            "object": "list",
            "data": [
                {
                    "id": c.name,
                    "object": "model",
                    "created": 1700000000,  # static placeholder timestamp
                    "owned_by": c.author.lower().replace(" ", "-"),
                }
                for c in cls._by_name.values()
            ],
        }

    @classmethod
    def internal_list(cls) -> dict:
        """GET /api/internal/v1/models — rich listing, grouped by family."""
        fam: Dict[str, list] = {}
        for c in cls._by_name.values():
            fam.setdefault(c.family, []).append(
                {
                    "id": c.name,
                    "name": c.real_name,
                    "author": c.author,
                    "backend_id": c.model_id,
                }
            )
        return {
            "server": "DevsDo API",
            "version": "1.0.0",
            "timestamp": int(time.time()),
            "total": len(cls._by_name),
            "families": [
                {"family": fn, "count": len(ms), "models": ms}
                for fn, ms in fam.items()
            ],
        }
|
|
|
|
| |
| |
| |
|
|
# One-shot registration of every served model, grouped by family.
# The first card registered ("kimi-k2.5") becomes Registry._default.
# NOTE(review): 54 cards are registered here, but the module banner and the
# FastAPI description advertise 52 — confirm which count is correct.
Registry.add(
    # — Flagships —
    ModelCard("kimi-k2.5", "Kimi K2.5", "Moonshot AI", "Kimi", "@cf/moonshotai/kimi-k2.5"),
    ModelCard("nemotron-120b", "Nemotron 3 120B A12B", "NVIDIA", "Nemotron", "@cf/nvidia/nemotron-3-120b-a12b"),
    ModelCard("gpt-oss-120b", "GPT-OSS 120B", "OpenAI", "GPT-OSS", "@cf/openai/gpt-oss-120b"),
    ModelCard("gpt-oss-20b", "GPT-OSS 20B", "OpenAI", "GPT-OSS", "@cf/openai/gpt-oss-20b"),
    ModelCard("llama-3.3-70b", "LLaMA 3.3 70B Instruct FP8", "Meta", "LLaMA", "@cf/meta/llama-3.3-70b-instruct-fp8-fast"),

    # — Meta LLaMA —
    ModelCard("llama-4-scout", "LLaMA 4 Scout 17B 16E", "Meta", "LLaMA", "@cf/meta/llama-4-scout-17b-16e-instruct"),
    ModelCard("llama-3.2-11b-vision","LLaMA 3.2 11B Vision", "Meta", "LLaMA", "@cf/meta/llama-3.2-11b-vision-instruct"),
    ModelCard("llama-3.1-8b", "LLaMA 3.1 8B Fast", "Meta", "LLaMA", "@cf/meta/llama-3.1-8b-instruct-fast"),
    ModelCard("llama-3.1-8b-fp8", "LLaMA 3.1 8B FP8", "Meta", "LLaMA", "@cf/meta/llama-3.1-8b-instruct-fp8"),
    ModelCard("llama-3.1-8b-awq", "LLaMA 3.1 8B AWQ", "Meta", "LLaMA", "@cf/meta/llama-3.1-8b-instruct-awq"),
    ModelCard("llama-3.2-3b", "LLaMA 3.2 3B", "Meta", "LLaMA", "@cf/meta/llama-3.2-3b-instruct"),
    ModelCard("llama-3.2-1b", "LLaMA 3.2 1B", "Meta", "LLaMA", "@cf/meta/llama-3.2-1b-instruct"),
    ModelCard("llama-3-8b", "LLaMA 3 8B", "Meta", "LLaMA", "@cf/meta/llama-3-8b-instruct"),
    ModelCard("llama-3-8b-awq", "LLaMA 3 8B AWQ", "Meta", "LLaMA", "@cf/meta/llama-3-8b-instruct-awq"),
    ModelCard("llama-guard-3", "LLaMA Guard 3 8B", "Meta", "LLaMA", "@cf/meta/llama-guard-3-8b"),
    ModelCard("llama-2-7b-fp16", "LLaMA 2 7B FP16", "Meta", "LLaMA", "@cf/meta/llama-2-7b-chat-fp16"),
    ModelCard("llama-2-7b-int8", "LLaMA 2 7B INT8", "Meta", "LLaMA", "@cf/meta/llama-2-7b-chat-int8"),
    ModelCard("llama-2-7b-lora", "LLaMA 2 7B LoRA", "Meta", "LLaMA", "@cf/meta-llama/llama-2-7b-chat-hf-lora"),
    ModelCard("llama-2-13b", "LLaMA 2 13B AWQ", "Meta", "LLaMA", "@hf/thebloke/llama-2-13b-chat-awq"),

    # — Qwen —
    ModelCard("qwq-32b", "QwQ 32B", "Qwen", "Qwen", "@cf/qwen/qwq-32b"),
    ModelCard("qwen-coder-32b", "Qwen 2.5 Coder 32B", "Qwen", "Qwen", "@cf/qwen/qwen2.5-coder-32b-instruct"),
    ModelCard("qwen3-30b", "Qwen 3 30B A3B FP8", "Qwen", "Qwen", "@cf/qwen/qwen3-30b-a3b-fp8"),
    ModelCard("qwen1.5-14b", "Qwen 1.5 14B AWQ", "Qwen", "Qwen", "@cf/qwen/qwen1.5-14b-chat-awq"),
    ModelCard("qwen1.5-7b", "Qwen 1.5 7B AWQ", "Qwen", "Qwen", "@cf/qwen/qwen1.5-7b-chat-awq"),
    ModelCard("qwen1.5-1.8b", "Qwen 1.5 1.8B", "Qwen", "Qwen", "@cf/qwen/qwen1.5-1.8b-chat"),
    ModelCard("qwen1.5-0.5b", "Qwen 1.5 0.5B", "Qwen", "Qwen", "@cf/qwen/qwen1.5-0.5b-chat"),

    # — DeepSeek —
    ModelCard("deepseek-r1", "DeepSeek R1 Distill Qwen 32B", "DeepSeek", "DeepSeek", "@cf/deepseek-ai/deepseek-r1-distill-qwen-32b"),
    ModelCard("deepseek-math", "DeepSeek Math 7B", "DeepSeek", "DeepSeek", "@cf/deepseek-ai/deepseek-math-7b-instruct"),
    ModelCard("deepseek-coder-base", "DeepSeek Coder 6.7B Base", "DeepSeek", "DeepSeek", "@hf/thebloke/deepseek-coder-6.7b-base-awq"),
    ModelCard("deepseek-coder", "DeepSeek Coder 6.7B Instruct", "DeepSeek", "DeepSeek", "@hf/thebloke/deepseek-coder-6.7b-instruct-awq"),

    # — Google Gemma —
    ModelCard("gemma-3-12b", "Gemma 3 12B IT", "Google", "Gemma", "@cf/google/gemma-3-12b-it"),
    ModelCard("gemma-7b", "Gemma 7B IT", "Google", "Gemma", "@hf/google/gemma-7b-it"),
    ModelCard("gemma-2b-lora", "Gemma 2B IT LoRA", "Google", "Gemma", "@cf/google/gemma-2b-it-lora"),
    ModelCard("gemma-7b-lora", "Gemma 7B IT LoRA", "Google", "Gemma", "@cf/google/gemma-7b-it-lora"),

    # — Mistral —
    ModelCard("mistral-small-3.1", "Mistral Small 3.1 24B", "Mistral AI", "Mistral", "@cf/mistralai/mistral-small-3.1-24b-instruct"),
    ModelCard("mistral-v0.2", "Mistral 7B v0.2", "Mistral AI", "Mistral", "@hf/mistral/mistral-7b-instruct-v0.2"),
    ModelCard("mistral-v0.2-lora", "Mistral 7B v0.2 LoRA", "Mistral AI", "Mistral", "@cf/mistral/mistral-7b-instruct-v0.2-lora"),
    ModelCard("mistral-v0.1", "Mistral 7B v0.1", "Mistral AI", "Mistral", "@cf/mistral/mistral-7b-instruct-v0.1"),
    ModelCard("mistral-v0.1-awq", "Mistral 7B v0.1 AWQ", "Mistral AI", "Mistral", "@hf/thebloke/mistral-7b-instruct-v0.1-awq"),

    # — Single-model families —
    ModelCard("granite-4.0", "Granite 4.0 H Micro", "IBM", "Granite", "@cf/ibm-granite/granite-4.0-h-micro"),
    ModelCard("glm-4.7-flash", "GLM 4.7 Flash", "ZhipuAI", "GLM", "@cf/zai-org/glm-4.7-flash"),
    ModelCard("sea-lion-27b", "SEA-LION v4 27B", "AI Singapore", "SEA-LION", "@cf/aisingapore/gemma-sea-lion-v4-27b-it"),

    # — Community / misc —
    ModelCard("hermes-2-pro", "Hermes 2 Pro Mistral 7B", "NousResearch", "Hermes", "@hf/nousresearch/hermes-2-pro-mistral-7b"),
    ModelCard("openhermes-2.5", "OpenHermes 2.5 Mistral 7B", "NousResearch", "Hermes", "@hf/thebloke/openhermes-2.5-mistral-7b-awq"),
    ModelCard("starling-7b", "Starling LM 7B Beta", "Nexusflow", "Starling", "@hf/nexusflow/starling-lm-7b-beta"),
    ModelCard("neural-chat-7b", "Neural Chat 7B v3.1", "Intel", "Neural Chat", "@hf/thebloke/neural-chat-7b-v3-1-awq"),
    ModelCard("openchat-3.5", "OpenChat 3.5", "OpenChat", "OpenChat", "@cf/openchat/openchat-3.5-0106"),
    ModelCard("cybertron-7b", "UNA Cybertron 7B v2", "fblgit", "Cybertron", "@cf/fblgit/una-cybertron-7b-v2-bf16"),
    ModelCard("discolm-german-7b", "DiscoLM German 7B", "TheBloke", "DiscoLM", "@cf/thebloke/discolm-german-7b-v1-awq"),
    ModelCard("zephyr-7b", "Zephyr 7B Beta", "HuggingFace", "Zephyr", "@hf/thebloke/zephyr-7b-beta-awq"),
    ModelCard("falcon-7b", "Falcon 7B Instruct", "TII UAE", "Falcon", "@cf/tiiuae/falcon-7b-instruct"),
    ModelCard("tinyllama-1.1b", "TinyLlama 1.1B Chat", "TinyLlama", "TinyLlama", "@cf/tinyllama/tinyllama-1.1b-chat-v1.0"),
    ModelCard("phi-2", "Phi 2", "Microsoft", "Phi", "@cf/microsoft/phi-2"),
    ModelCard("sqlcoder", "SQLCoder 7B 2", "Defog", "SQLCoder", "@cf/defog/sqlcoder-7b-2"),
)
|
|
|
|
| |
| |
| |
| |
| |
| |
| |
|
|
class ThinkParser:
    """Incremental splitter for ``<think>…</think>`` reasoning spans.

    Feed raw stream tokens as they arrive; get back ``(kind, text)`` pairs
    where kind is "reasoning" (inside think tags) or "content" (outside).
    A partial tag at the end of the buffer is held back until enough
    characters arrive to classify it one way or the other.
    """

    __slots__ = ("thinking", "buf")

    OPEN = "<think>"
    CLOSE = "</think>"

    def __init__(self):
        self.thinking = False  # currently inside a <think> span?
        self.buf = ""          # unclassified tail of the stream

    def feed(self, token: str) -> list[tuple[str, str]]:
        """Absorb one token and return every piece that is safe to emit."""
        self.buf += token
        pieces: list[tuple[str, str]] = []

        while self.buf:
            if self.thinking:
                tag, kind = self.CLOSE, "reasoning"
            else:
                tag, kind = self.OPEN, "content"

            pos = self.buf.find(tag)
            if pos != -1:
                # Emit everything before the tag, drop the tag, flip state.
                if pos:
                    pieces.append((kind, self.buf[:pos]))
                self.buf = self.buf[pos + len(tag):]
                self.thinking = not self.thinking
                continue

            # No complete tag — hold back a trailing partial match (it may
            # become a full tag next feed) and emit the rest.
            held = self._partial(tag)
            if held:
                emit = self.buf[: -len(held)]
                if emit:
                    pieces.append((kind, emit))
                self.buf = held
            else:
                pieces.append((kind, self.buf))
                self.buf = ""
            break

        return pieces

    def flush(self) -> list[tuple[str, str]]:
        """Emit whatever is still buffered once the stream has ended."""
        if not self.buf:
            return []
        leftover = ("reasoning" if self.thinking else "content", self.buf)
        self.buf = ""
        return [leftover]

    def _partial(self, tag: str) -> str:
        """Longest suffix of the buffer that is a proper prefix of *tag*."""
        limit = min(len(tag) - 1, len(self.buf))
        for size in range(limit, 0, -1):
            if tag.startswith(self.buf[-size:]):
                return self.buf[-size:]
        return ""
|
|
|
|
| |
| |
| |
| |
| |
| |
| |
|
|
# Upstream proxy endpoint that actually serves the models.
_BACKEND = "https://adarshu07-ls.hf.space"
_BACKEND_URL = f"{_BACKEND}/v1/chat/completions"

# Statuses worth retrying: rate limiting plus transient server/CDN errors.
_RETRYABLE = frozenset({429, 500, 502, 503, 504, 520, 521, 522, 523, 524})
# Statuses that can never succeed on retry: client-side request errors.
_FATAL = frozenset({400, 401, 403, 404, 405, 422})

# Browser-like headers sent on every upstream request.
# NOTE(review): presumably the backend expects a browser Origin/Referer and
# User-Agent — confirm these are actually required.
_BE_HEADERS = {
    "Accept": "application/json",
    "Accept-Encoding": "gzip, deflate, br",
    "Content-Type": "application/json",
    "Origin": _BACKEND,
    "Referer": f"{_BACKEND}/docs",
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/131.0.0.0 Safari/537.36"
    ),
}
|
|
|
|
| def _parse_sse(line: str) -> tuple[str, bool]: |
| """One SSE data: line → (token_text, is_done).""" |
| line = line.strip() |
| if not line.startswith("data:"): |
| return "", False |
| payload = line[5:].strip() |
| if payload == "[DONE]": |
| return "", True |
| try: |
| obj = json.loads(payload) |
| if "error" in obj: |
| return "", True |
| delta = obj.get("choices", [{}])[0].get("delta", {}) |
| return delta.get("content", "") or "", False |
| except (json.JSONDecodeError, KeyError, IndexError): |
| return "", False |
|
|
|
|
async def backend_stream(
    session: aiohttp.ClientSession,
    messages: list[dict],
    model_id: str,
    temperature: float = 0.7,
    max_tokens: int = 4096,
    timeout: int = 180,
    retries: int = 2,
) -> AsyncGenerator[str, None]:
    """POST → upstream, parse SSE, yield raw tokens.

    Retries network failures and _RETRYABLE statuses up to `retries` times
    with backoff; _FATAL statuses raise immediately. Raises RuntimeError on
    any unrecoverable failure.
    """

    body: dict = {
        "model": model_id,
        "messages": messages,
        "stream": True,
        "temperature": temperature,
    }
    if max_tokens:
        body["max_tokens"] = max_tokens

    last_err = ""

    for attempt in range(1 + retries):
        try:
            async with session.post(
                _BACKEND_URL,
                json=body,
                timeout=aiohttp.ClientTimeout(
                    total=timeout,
                    sock_connect=30,
                    sock_read=timeout,
                ),
            ) as resp:

                if resp.status == 200:
                    # Stream line-by-line; stop on [DONE], error payload, or EOF.
                    while True:
                        raw = await resp.content.readline()
                        if not raw:
                            break
                        line = raw.decode("utf-8", errors="replace")
                        if not line.strip():
                            continue
                        tok, done = _parse_sse(line)
                        if done:
                            return
                        if tok:
                            yield tok
                    return

                # Non-200: capture a short error snippet for logs/exceptions.
                text = await resp.text()
                last_err = f"HTTP {resp.status}: {text[:300]}"

                if resp.status in _FATAL:
                    raise RuntimeError(last_err)
                if resp.status in _RETRYABLE and attempt < retries:
                    # Linear backoff with jitter, capped at 15s.
                    wait = min(2.0 * (attempt + 1) + random.random(), 15)
                    log.warning(f"Retry {attempt+1}/{retries} in {wait:.1f}s — {last_err}")
                    await asyncio.sleep(wait)
                    continue
                raise RuntimeError(last_err)

        except (RuntimeError, GeneratorExit):
            # Our own errors and consumer cancellation pass straight through.
            raise
        except (aiohttp.ClientError, asyncio.TimeoutError, OSError) as exc:
            # NOTE(review): a connection drop *mid-stream* lands here too —
            # the retry replays the request from scratch, so tokens already
            # yielded may be duplicated downstream. Confirm this is intended.
            last_err = str(exc)
            if attempt < retries:
                log.warning(f"Retry {attempt+1}/{retries} — {last_err}")
                await asyncio.sleep(1.5 * (attempt + 1))
                continue
            raise RuntimeError(f"Backend unreachable: {last_err}") from exc

    raise RuntimeError(f"All retries exhausted: {last_err}")
|
|
|
|
| |
| |
| |
|
|
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Create the shared aiohttp session on startup; close it on shutdown."""
    # ThreadedResolver does DNS in threads, avoiding an aiodns dependency.
    connector = aiohttp.TCPConnector(
        resolver=aiohttp.resolver.ThreadedResolver(),
        limit=100,            # total pooled connections
        limit_per_host=15,
        ttl_dns_cache=300,    # seconds
        keepalive_timeout=60,
        enable_cleanup_closed=True,
    )
    # One session reused by every request handler via app.state.http.
    app.state.http = aiohttp.ClientSession(
        connector=connector,
        headers=_BE_HEADERS,
    )
    log.info("══════════════════════════════════════════")
    log.info(" ⚡ DevsDo API Server v1.0.0")
    log.info(f" Models : {len(Registry.all_cards())}")
    log.info(f" Backend: {_BACKEND}")
    log.info(f" Port : 7860")
    log.info("══════════════════════════════════════════")
    yield
    # Shutdown: release the connection pool.
    await app.state.http.close()
    log.info("Server stopped ✓")
|
|
|
|
# FastAPI application; the lifespan handler wires up the aiohttp session.
app = FastAPI(
    title="⚡ DevsDo API",
    description="OpenAI-compatible · 52 Models · Streaming · Reasoning",
    version="1.0.0",
    docs_url="/docs",
    redoc_url="/redoc",
    lifespan=lifespan,
)

# Wide-open CORS: any origin, method, and header.
# NOTE(review): allow_origins=["*"] combined with allow_credentials=True is
# a permissive configuration — confirm credentialed cross-origin requests
# are actually intended.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
|
|
|
|
| |
| |
| |
|
|
class Message(BaseModel):
    """One chat turn in the OpenAI wire format."""

    role: str     # e.g. "system" / "user" / "assistant" — not validated here
    content: str  # plain-text message body
|
|
class ChatRequest(BaseModel):
    """Request body for POST /v1/chat/completions (OpenAI-compatible subset)."""

    model: str = "kimi-k2.5"  # alias, backend id, or fuzzy name (see Registry.resolve)
    messages: list[Message] = Field(..., min_length=1)  # at least one turn
    stream: bool = False  # True → SSE chunks, False → single JSON body
    temperature: float = Field(default=0.7, ge=0.0, le=2.0)
    max_tokens: Optional[int] = Field(default=4096, ge=1)
|
|
|
|
| |
| |
| |
|
|
| def _cid() -> str: |
| """Generate a chat-completion ID.""" |
| return f"chatcmpl-{uuid.uuid4().hex[:29]}" |
|
|
| def _sse(obj: Any) -> str: |
| """Format one SSE frame.""" |
| return f"data: {json.dumps(obj, ensure_ascii=False)}\n\n" |
|
|
|
|
| |
|
|
@app.get("/")
async def root():
    """Service banner plus a quick directory of available endpoints."""
    endpoints = {
        "health": "GET /health",
        "models_openai": "GET /v1/models",
        "models_detail": "GET /api/internal/v1/models",
        "chat": "POST /v1/chat/completions",
    }
    return {
        "service": "⚡ DevsDo API",
        "version": "1.0.0",
        "status": "running",
        "models": len(Registry.all_cards()),
        "docs": "/docs",
        "endpoints": endpoints,
    }
|
|
|
|
@app.get("/health")
async def health():
    """Liveness probe: static status plus registry size and backend origin."""
    payload = {
        "status": "healthy",
        "timestamp": int(time.time()),
        "models": len(Registry.all_cards()),
        "backend": _BACKEND,
    }
    return payload
|
|
|
|
| |
|
|
@app.get("/v1/models")
async def models_openai():
    """OpenAI-compatible model listing (delegates to the registry)."""
    listing = Registry.openai_list()
    return listing
|
|
|
|
@app.get("/api/internal/v1/models")
async def models_internal():
    """Detailed model registry, grouped by family (internal extension)."""
    grouped = Registry.internal_list()
    return grouped
|
|
|
|
| |
|
|
@app.post("/v1/chat/completions")
async def chat_completions(req: ChatRequest):
    """
    OpenAI-compatible chat completions.

    • stream=false → JSON body; reasoning text (if any) under `message.reasoning`
    • stream=true  → SSE chunks; reasoning deltas carry a `reasoning` key
    """
    model_id = Registry.resolve(req.model)       # backend id for upstream
    card = Registry.find(req.model)
    display = card.name if card else req.model   # echoed back as "model"

    # Pydantic models → plain dicts for the upstream JSON body.
    msgs = [{"role": m.role, "content": m.content} for m in req.messages]

    if req.stream:
        return StreamingResponse(
            _stream_gen(app.state.http, msgs, model_id, display,
                        req.temperature, req.max_tokens or 4096),
            media_type="text/event-stream",
            headers={
                "Cache-Control": "no-cache",
                "Connection": "keep-alive",
                "X-Accel-Buffering": "no",  # disable proxy buffering for SSE
            },
        )

    return await _complete(
        app.state.http, msgs, model_id, display,
        req.temperature, req.max_tokens or 4096,
    )
|
|
|
|
| |
| |
| |
| |
| |
| |
| |
| |
|
|
async def _stream_gen(
    session: aiohttp.ClientSession,
    messages: list[dict],
    model_id: str,
    model_name: str,
    temperature: float,
    max_tokens: int,
) -> AsyncGenerator[str, None]:
    """Yield OpenAI ``chat.completion.chunk`` SSE frames from the backend.

    Emits a role-priming chunk, then content/reasoning deltas (reasoning
    text split out by ThinkParser under a `reasoning` key), then a finish
    chunk and the `[DONE]` sentinel.
    """
    cid = _cid()
    ts = int(time.time())
    parser = ThinkParser()

    def _chunk(delta: dict, finish: Optional[str] = None) -> str:
        # One SSE frame in OpenAI chunk format; same id/timestamp throughout.
        return _sse({
            "id": cid,
            "object": "chat.completion.chunk",
            "created": ts,
            "model": model_name,
            "choices": [{
                "index": 0,
                "delta": delta,
                "finish_reason": finish,
            }],
        })

    # OpenAI streams begin with a role-only delta.
    yield _chunk({"role": "assistant"})

    try:
        async for token in backend_stream(
            session, messages, model_id, temperature, max_tokens,
        ):
            for kind, text in parser.feed(token):
                if kind == "reasoning":
                    yield _chunk({"reasoning": text})
                else:
                    yield _chunk({"content": text})

        # Drain any partial tag the parser is still holding at end-of-stream.
        for kind, text in parser.flush():
            if kind == "reasoning":
                yield _chunk({"reasoning": text})
            else:
                yield _chunk({"content": text})

        yield _chunk({}, finish="stop")
        yield "data: [DONE]\n\n"

    except Exception as exc:
        # The HTTP status is already sent, so surface the failure in-band.
        # NOTE(review): finish_reason "error" is not a standard OpenAI value —
        # confirm downstream clients tolerate it.
        log.error(f"Stream error [{model_name}]: {exc}")
        yield _chunk({"content": f"\n\n[Error: {exc}]"}, finish="error")
        yield "data: [DONE]\n\n"
|
|
|
|
| |
| |
| |
|
|
async def _complete(
    session: aiohttp.ClientSession,
    messages: list[dict],
    model_id: str,
    model_name: str,
    temperature: float,
    max_tokens: int,
) -> dict:
    """Collect the full upstream response and return an OpenAI-style body.

    Reasoning (`<think>…</think>`) text is separated from visible output:
    content goes in `message.content`; reasoning, if any, in
    `message.reasoning`.

    Raises:
        HTTPException: 502 when the backend fails for any reason; the
            original exception is chained as the cause.
    """
    parser = ThinkParser()
    reasoning: list[str] = []
    content: list[str] = []

    try:
        async for token in backend_stream(
            session, messages, model_id, temperature, max_tokens,
        ):
            for kind, text in parser.feed(token):
                (reasoning if kind == "reasoning" else content).append(text)

        # Drain anything the parser is still holding (e.g. a partial tag).
        for kind, text in parser.flush():
            (reasoning if kind == "reasoning" else content).append(text)

    except Exception as exc:
        # Chain the cause so the 502's traceback shows the real failure.
        raise HTTPException(status_code=502, detail=f"Backend error: {exc}") from exc

    msg: dict = {
        "role": "assistant",
        "content": "".join(content),
    }
    if reasoning:
        msg["reasoning"] = "".join(reasoning)

    # Upstream sends no usage data — approximate with the ~4 chars/token rule.
    total_chars = len(msg["content"]) + len(msg.get("reasoning", ""))

    return {
        "id": _cid(),
        "object": "chat.completion",
        "created": int(time.time()),
        "model": model_name,
        "choices": [{
            "index": 0,
            "message": msg,
            "finish_reason": "stop",
        }],
        "usage": {
            "prompt_tokens": 0,
            "completion_tokens": total_chars // 4,
            "total_tokens": total_chars // 4,
        },
    }
|
|
|
|
| |
| |
| |
|
|
if __name__ == "__main__":
    import uvicorn
    # NOTE(review): the "app:app" import string assumes this file is named
    # app.py — confirm against the actual filename/deploy config.
    uvicorn.run(
        "app:app",
        host="0.0.0.0",
        port=7860,
        workers=1,
        timeout_keep_alive=120,
        log_level="info",
    )