| """ |
| ══════════════════════════════════════════════════════════════════ |
| ⚡ DevsDo API Server v1.0.0 |
| |
| OpenAI-compatible · 52 Models · Cloudflare AI Backend |
| SSE Streaming · <think> Reasoning · Zero API Keys |
| |
| Sections |
| ──────── |
| §1 Logging |
| §2 Model Registry (g4f-style) |
| §3 Register All 52 Models |
| §4 Think-Tag Stream Parser |
| §5 Backend Client (SSE → raw tokens) |
| §6 FastAPI App + Lifespan |
| §7 Pydantic Schemas |
| §8 Routes |
| §9 Stream Generator (tokens → OpenAI SSE) |
| §10 Non-Stream Collector |
| §11 Entrypoint |
| ══════════════════════════════════════════════════════════════════ |
| """ |
|
|
| from __future__ import annotations |
|
|
| import json, time, uuid, asyncio, random, logging |
| from contextlib import asynccontextmanager |
| from dataclasses import dataclass, asdict |
| from typing import Optional, AsyncGenerator, Dict, List, Any |
|
|
| import aiohttp |
| import aiohttp.resolver |
| from fastapi import FastAPI, HTTPException |
| from fastapi.responses import StreamingResponse |
| from fastapi.middleware.cors import CORSMiddleware |
| from pydantic import BaseModel, Field |
|
|
|
|
| |
| |
| |
|
|
# Root logging config: timestamped, column-aligned level names, HH:MM:SS only.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s │ %(levelname)-7s │ %(message)s",
    datefmt="%H:%M:%S",
)
# Module-wide logger used by the backend client and route handlers below.
log = logging.getLogger("devsdo")
|
|
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
@dataclass(frozen=True, slots=True)
class ModelCard:
    """Immutable metadata for one served model (public alias → backend id)."""

    name: str       # public alias exposed to API clients (e.g. "kimi-k2.5")
    real_name: str  # human-readable display name
    author: str     # vendor / organization
    family: str     # grouping key used by /api/internal/v1/models
    model_id: str   # upstream backend identifier (e.g. "@cf/..." or "@hf/...")
|
|
|
|
class Registry:
    """Central model store — register once, resolve anywhere.

    All state lives on the class itself: the registry is a process-wide
    singleton populated once at import time via ``Registry.add``.
    """

    _by_name: Dict[str, ModelCard] = {}  # public alias → card (insertion order)
    _by_id: Dict[str, ModelCard] = {}    # backend model_id → card
    _default: str = ""                   # alias of the first card ever added

    @classmethod
    def add(cls, *cards: ModelCard) -> None:
        """Register cards; the very first card registered becomes the default."""
        for c in cards:
            cls._by_name[c.name] = c
            cls._by_id[c.model_id] = c
            if not cls._default:
                cls._default = c.name

    @classmethod
    def resolve(cls, raw: Optional[str]) -> str:
        """Alias / full-id / fuzzy → backend model_id.

        Resolution order: default (empty input) → strip one vendor prefix →
        pass raw backend ids through → exact alias match → first substring
        match in registration order → echo input unchanged.
        """
        if not raw:
            return cls._by_name[cls._default].model_id
        raw = raw.strip()
        # Strip at most one optional vendor prefix (case-insensitive match,
        # but the remainder keeps its original case).
        for pfx in ("devsdo/", "devsdo:", "cloudflare/", "cf/"):
            if raw.lower().startswith(pfx):
                raw = raw[len(pfx):]
                break
        # Already a raw backend id — pass through untouched.
        if raw.startswith(("@cf/", "@hf/")):
            return raw
        # Exact alias match.
        if raw in cls._by_name:
            return cls._by_name[raw].model_id
        # Fuzzy: first alias or backend id containing the input as a
        # substring wins; registration order decides ties.
        low = raw.lower()
        for alias, card in cls._by_name.items():
            if low in alias or low in card.model_id.lower():
                return card.model_id
        # Unknown — hand the string to the backend and let it reject it.
        return raw

    @classmethod
    def find(cls, raw: str) -> Optional[ModelCard]:
        """Best-effort card lookup; None when nothing matches."""
        mid = cls.resolve(raw)
        return cls._by_id.get(mid) or cls._by_name.get(raw)

    @classmethod
    def all_cards(cls) -> List[ModelCard]:
        """Every registered card, in registration order."""
        return list(cls._by_name.values())

    @classmethod
    def openai_list(cls) -> dict:
        """GET /v1/models — OpenAI-compatible listing."""
        return {
            "object": "list",
            "data": [
                {
                    "id": c.name,
                    "object": "model",
                    "created": 1700000000,  # static placeholder timestamp
                    "owned_by": c.author.lower().replace(" ", "-"),
                }
                for c in cls._by_name.values()
            ],
        }

    @classmethod
    def internal_list(cls) -> dict:
        """GET /api/internal/v1/models — rich listing, grouped by family."""
        fam: Dict[str, list] = {}
        for c in cls._by_name.values():
            fam.setdefault(c.family, []).append(
                {
                    "id": c.name,
                    "name": c.real_name,
                    "author": c.author,
                    "backend_id": c.model_id,
                }
            )
        return {
            "server": "DevsDo API",
            "version": "1.0.0",
            "timestamp": int(time.time()),
            "total": len(cls._by_name),
            "families": [
                {"family": fn, "count": len(ms), "models": ms}
                for fn, ms in fam.items()
            ],
        }
|
|
|
|
| |
| |
| |
|
|
# One-shot registration of every served model, grouped by family.
# The first card registered ("kimi-k2.5") becomes Registry._default.
# NOTE(review): 54 cards are registered here, but the module banner and the
# FastAPI description advertise 52 — confirm which count is correct.
Registry.add(
    # — Flagships —
    ModelCard("kimi-k2.5", "Kimi K2.5", "Moonshot AI", "Kimi", "@cf/moonshotai/kimi-k2.5"),
    ModelCard("nemotron-120b", "Nemotron 3 120B A12B", "NVIDIA", "Nemotron", "@cf/nvidia/nemotron-3-120b-a12b"),
    ModelCard("gpt-oss-120b", "GPT-OSS 120B", "OpenAI", "GPT-OSS", "@cf/openai/gpt-oss-120b"),
    ModelCard("gpt-oss-20b", "GPT-OSS 20B", "OpenAI", "GPT-OSS", "@cf/openai/gpt-oss-20b"),
    ModelCard("llama-3.3-70b", "LLaMA 3.3 70B Instruct FP8", "Meta", "LLaMA", "@cf/meta/llama-3.3-70b-instruct-fp8-fast"),

    # — Meta LLaMA —
    ModelCard("llama-4-scout", "LLaMA 4 Scout 17B 16E", "Meta", "LLaMA", "@cf/meta/llama-4-scout-17b-16e-instruct"),
    ModelCard("llama-3.2-11b-vision","LLaMA 3.2 11B Vision", "Meta", "LLaMA", "@cf/meta/llama-3.2-11b-vision-instruct"),
    ModelCard("llama-3.1-8b", "LLaMA 3.1 8B Fast", "Meta", "LLaMA", "@cf/meta/llama-3.1-8b-instruct-fast"),
    ModelCard("llama-3.1-8b-fp8", "LLaMA 3.1 8B FP8", "Meta", "LLaMA", "@cf/meta/llama-3.1-8b-instruct-fp8"),
    ModelCard("llama-3.1-8b-awq", "LLaMA 3.1 8B AWQ", "Meta", "LLaMA", "@cf/meta/llama-3.1-8b-instruct-awq"),
    ModelCard("llama-3.2-3b", "LLaMA 3.2 3B", "Meta", "LLaMA", "@cf/meta/llama-3.2-3b-instruct"),
    ModelCard("llama-3.2-1b", "LLaMA 3.2 1B", "Meta", "LLaMA", "@cf/meta/llama-3.2-1b-instruct"),
    ModelCard("llama-3-8b", "LLaMA 3 8B", "Meta", "LLaMA", "@cf/meta/llama-3-8b-instruct"),
    ModelCard("llama-3-8b-awq", "LLaMA 3 8B AWQ", "Meta", "LLaMA", "@cf/meta/llama-3-8b-instruct-awq"),
    ModelCard("llama-guard-3", "LLaMA Guard 3 8B", "Meta", "LLaMA", "@cf/meta/llama-guard-3-8b"),
    ModelCard("llama-2-7b-fp16", "LLaMA 2 7B FP16", "Meta", "LLaMA", "@cf/meta/llama-2-7b-chat-fp16"),
    ModelCard("llama-2-7b-int8", "LLaMA 2 7B INT8", "Meta", "LLaMA", "@cf/meta/llama-2-7b-chat-int8"),
    ModelCard("llama-2-7b-lora", "LLaMA 2 7B LoRA", "Meta", "LLaMA", "@cf/meta-llama/llama-2-7b-chat-hf-lora"),
    ModelCard("llama-2-13b", "LLaMA 2 13B AWQ", "Meta", "LLaMA", "@hf/thebloke/llama-2-13b-chat-awq"),

    # — Qwen —
    ModelCard("qwq-32b", "QwQ 32B", "Qwen", "Qwen", "@cf/qwen/qwq-32b"),
    ModelCard("qwen-coder-32b", "Qwen 2.5 Coder 32B", "Qwen", "Qwen", "@cf/qwen/qwen2.5-coder-32b-instruct"),
    ModelCard("qwen3-30b", "Qwen 3 30B A3B FP8", "Qwen", "Qwen", "@cf/qwen/qwen3-30b-a3b-fp8"),
    ModelCard("qwen1.5-14b", "Qwen 1.5 14B AWQ", "Qwen", "Qwen", "@cf/qwen/qwen1.5-14b-chat-awq"),
    ModelCard("qwen1.5-7b", "Qwen 1.5 7B AWQ", "Qwen", "Qwen", "@cf/qwen/qwen1.5-7b-chat-awq"),
    ModelCard("qwen1.5-1.8b", "Qwen 1.5 1.8B", "Qwen", "Qwen", "@cf/qwen/qwen1.5-1.8b-chat"),
    ModelCard("qwen1.5-0.5b", "Qwen 1.5 0.5B", "Qwen", "Qwen", "@cf/qwen/qwen1.5-0.5b-chat"),

    # — DeepSeek —
    ModelCard("deepseek-r1", "DeepSeek R1 Distill Qwen 32B", "DeepSeek", "DeepSeek", "@cf/deepseek-ai/deepseek-r1-distill-qwen-32b"),
    ModelCard("deepseek-math", "DeepSeek Math 7B", "DeepSeek", "DeepSeek", "@cf/deepseek-ai/deepseek-math-7b-instruct"),
    ModelCard("deepseek-coder-base", "DeepSeek Coder 6.7B Base", "DeepSeek", "DeepSeek", "@hf/thebloke/deepseek-coder-6.7b-base-awq"),
    ModelCard("deepseek-coder", "DeepSeek Coder 6.7B Instruct", "DeepSeek", "DeepSeek", "@hf/thebloke/deepseek-coder-6.7b-instruct-awq"),

    # — Google Gemma —
    ModelCard("gemma-3-12b", "Gemma 3 12B IT", "Google", "Gemma", "@cf/google/gemma-3-12b-it"),
    ModelCard("gemma-7b", "Gemma 7B IT", "Google", "Gemma", "@hf/google/gemma-7b-it"),
    ModelCard("gemma-2b-lora", "Gemma 2B IT LoRA", "Google", "Gemma", "@cf/google/gemma-2b-it-lora"),
    ModelCard("gemma-7b-lora", "Gemma 7B IT LoRA", "Google", "Gemma", "@cf/google/gemma-7b-it-lora"),

    # — Mistral —
    ModelCard("mistral-small-3.1", "Mistral Small 3.1 24B", "Mistral AI", "Mistral", "@cf/mistralai/mistral-small-3.1-24b-instruct"),
    ModelCard("mistral-v0.2", "Mistral 7B v0.2", "Mistral AI", "Mistral", "@hf/mistral/mistral-7b-instruct-v0.2"),
    ModelCard("mistral-v0.2-lora", "Mistral 7B v0.2 LoRA", "Mistral AI", "Mistral", "@cf/mistral/mistral-7b-instruct-v0.2-lora"),
    ModelCard("mistral-v0.1", "Mistral 7B v0.1", "Mistral AI", "Mistral", "@cf/mistral/mistral-7b-instruct-v0.1"),
    ModelCard("mistral-v0.1-awq", "Mistral 7B v0.1 AWQ", "Mistral AI", "Mistral", "@hf/thebloke/mistral-7b-instruct-v0.1-awq"),

    # — Single-model families —
    ModelCard("granite-4.0", "Granite 4.0 H Micro", "IBM", "Granite", "@cf/ibm-granite/granite-4.0-h-micro"),
    ModelCard("glm-4.7-flash", "GLM 4.7 Flash", "ZhipuAI", "GLM", "@cf/zai-org/glm-4.7-flash"),
    ModelCard("sea-lion-27b", "SEA-LION v4 27B", "AI Singapore", "SEA-LION", "@cf/aisingapore/gemma-sea-lion-v4-27b-it"),

    # — Community / misc —
    ModelCard("hermes-2-pro", "Hermes 2 Pro Mistral 7B", "NousResearch", "Hermes", "@hf/nousresearch/hermes-2-pro-mistral-7b"),
    ModelCard("openhermes-2.5", "OpenHermes 2.5 Mistral 7B", "NousResearch", "Hermes", "@hf/thebloke/openhermes-2.5-mistral-7b-awq"),
    ModelCard("starling-7b", "Starling LM 7B Beta", "Nexusflow", "Starling", "@hf/nexusflow/starling-lm-7b-beta"),
    ModelCard("neural-chat-7b", "Neural Chat 7B v3.1", "Intel", "Neural Chat", "@hf/thebloke/neural-chat-7b-v3-1-awq"),
    ModelCard("openchat-3.5", "OpenChat 3.5", "OpenChat", "OpenChat", "@cf/openchat/openchat-3.5-0106"),
    ModelCard("cybertron-7b", "UNA Cybertron 7B v2", "fblgit", "Cybertron", "@cf/fblgit/una-cybertron-7b-v2-bf16"),
    ModelCard("discolm-german-7b", "DiscoLM German 7B", "TheBloke", "DiscoLM", "@cf/thebloke/discolm-german-7b-v1-awq"),
    ModelCard("zephyr-7b", "Zephyr 7B Beta", "HuggingFace", "Zephyr", "@hf/thebloke/zephyr-7b-beta-awq"),
    ModelCard("falcon-7b", "Falcon 7B Instruct", "TII UAE", "Falcon", "@cf/tiiuae/falcon-7b-instruct"),
    ModelCard("tinyllama-1.1b", "TinyLlama 1.1B Chat", "TinyLlama", "TinyLlama", "@cf/tinyllama/tinyllama-1.1b-chat-v1.0"),
    ModelCard("phi-2", "Phi 2", "Microsoft", "Phi", "@cf/microsoft/phi-2"),
    ModelCard("sqlcoder", "SQLCoder 7B 2", "Defog", "SQLCoder", "@cf/defog/sqlcoder-7b-2"),
)
|
|
|
|
| |
| |
| |
| |
| |
| |
| |
|
|
class ThinkParser:
    """Incremental splitter for ``<think>…</think>`` reasoning spans.

    Feed raw stream tokens as they arrive; get back ``(kind, text)`` pairs
    where kind is "reasoning" (inside think tags) or "content" (outside).
    A partial tag at the end of the buffer is held back until enough
    characters arrive to classify it one way or the other.
    """

    __slots__ = ("thinking", "buf")

    OPEN = "<think>"
    CLOSE = "</think>"

    def __init__(self):
        self.thinking = False  # currently inside a <think> span?
        self.buf = ""          # unclassified tail of the stream

    def feed(self, token: str) -> list[tuple[str, str]]:
        """Absorb one token and return every piece that is safe to emit."""
        self.buf += token
        pieces: list[tuple[str, str]] = []

        while self.buf:
            if self.thinking:
                tag, kind = self.CLOSE, "reasoning"
            else:
                tag, kind = self.OPEN, "content"

            pos = self.buf.find(tag)
            if pos != -1:
                # Emit everything before the tag, drop the tag, flip state.
                if pos:
                    pieces.append((kind, self.buf[:pos]))
                self.buf = self.buf[pos + len(tag):]
                self.thinking = not self.thinking
                continue

            # No complete tag — hold back a trailing partial match (it may
            # become a full tag next feed) and emit the rest.
            held = self._partial(tag)
            if held:
                emit = self.buf[: -len(held)]
                if emit:
                    pieces.append((kind, emit))
                self.buf = held
            else:
                pieces.append((kind, self.buf))
                self.buf = ""
            break

        return pieces

    def flush(self) -> list[tuple[str, str]]:
        """Emit whatever is still buffered once the stream has ended."""
        if not self.buf:
            return []
        leftover = ("reasoning" if self.thinking else "content", self.buf)
        self.buf = ""
        return [leftover]

    def _partial(self, tag: str) -> str:
        """Longest suffix of the buffer that is a proper prefix of *tag*."""
        limit = min(len(tag) - 1, len(self.buf))
        for size in range(limit, 0, -1):
            if tag.startswith(self.buf[-size:]):
                return self.buf[-size:]
        return ""
|
|
|
|
| |
| |
| |
| |
| |
| |
| |
|
|
# Upstream proxy endpoint that actually serves the models.
_BACKEND = "https://adarshu07-ls.hf.space"
_BACKEND_URL = f"{_BACKEND}/v1/chat/completions"

# Statuses worth retrying: rate limiting plus transient server/CDN errors.
_RETRYABLE = frozenset({429, 500, 502, 503, 504, 520, 521, 522, 523, 524})
# Statuses that can never succeed on retry: client-side request errors.
_FATAL = frozenset({400, 401, 403, 404, 405, 422})

# Browser-like headers sent on every upstream request.
# NOTE(review): presumably the backend expects a browser Origin/Referer and
# User-Agent — confirm these are actually required.
_BE_HEADERS = {
    "Accept": "application/json",
    "Accept-Encoding": "gzip, deflate, br",
    "Content-Type": "application/json",
    "Origin": _BACKEND,
    "Referer": f"{_BACKEND}/docs",
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/131.0.0.0 Safari/537.36"
    ),
}
|
|
|
|
| def _parse_sse(line: str) -> tuple[str, bool]: |
| """One SSE data: line → (token_text, is_done).""" |
| line = line.strip() |
| if not line.startswith("data:"): |
| return "", False |
| payload = line[5:].strip() |
| if payload == "[DONE]": |
| return "", True |
| try: |
| obj = json.loads(payload) |
| if "error" in obj: |
| return "", True |
| delta = obj.get("choices", [{}])[0].get("delta", {}) |
| return delta.get("content", "") or "", False |
| except (json.JSONDecodeError, KeyError, IndexError): |
| return "", False |
|
|
|
|
async def backend_stream(
    session: aiohttp.ClientSession,
    messages: list[dict],
    model_id: str,
    temperature: float = 0.7,
    max_tokens: int = 4096,
    timeout: int = 180,
    retries: int = 2,
) -> AsyncGenerator[str, None]:
    """POST → upstream, parse SSE, yield raw tokens.

    Retries network failures and _RETRYABLE statuses up to `retries` times
    with backoff; _FATAL statuses raise immediately. Raises RuntimeError on
    any unrecoverable failure.
    """

    body: dict = {
        "model": model_id,
        "messages": messages,
        "stream": True,
        "temperature": temperature,
    }
    if max_tokens:
        body["max_tokens"] = max_tokens

    last_err = ""

    for attempt in range(1 + retries):
        try:
            async with session.post(
                _BACKEND_URL,
                json=body,
                timeout=aiohttp.ClientTimeout(
                    total=timeout,
                    sock_connect=30,
                    sock_read=timeout,
                ),
            ) as resp:

                if resp.status == 200:
                    # Stream line-by-line; stop on [DONE], error payload, or EOF.
                    while True:
                        raw = await resp.content.readline()
                        if not raw:
                            break
                        line = raw.decode("utf-8", errors="replace")
                        if not line.strip():
                            continue
                        tok, done = _parse_sse(line)
                        if done:
                            return
                        if tok:
                            yield tok
                    return

                # Non-200: capture a short error snippet for logs/exceptions.
                text = await resp.text()
                last_err = f"HTTP {resp.status}: {text[:300]}"

                if resp.status in _FATAL:
                    raise RuntimeError(last_err)
                if resp.status in _RETRYABLE and attempt < retries:
                    # Linear backoff with jitter, capped at 15s.
                    wait = min(2.0 * (attempt + 1) + random.random(), 15)
                    log.warning(f"Retry {attempt+1}/{retries} in {wait:.1f}s — {last_err}")
                    await asyncio.sleep(wait)
                    continue
                raise RuntimeError(last_err)

        except (RuntimeError, GeneratorExit):
            # Our own errors and consumer cancellation pass straight through.
            raise
        except (aiohttp.ClientError, asyncio.TimeoutError, OSError) as exc:
            # NOTE(review): a connection drop *mid-stream* lands here too —
            # the retry replays the request from scratch, so tokens already
            # yielded may be duplicated downstream. Confirm this is intended.
            last_err = str(exc)
            if attempt < retries:
                log.warning(f"Retry {attempt+1}/{retries} — {last_err}")
                await asyncio.sleep(1.5 * (attempt + 1))
                continue
            raise RuntimeError(f"Backend unreachable: {last_err}") from exc

    raise RuntimeError(f"All retries exhausted: {last_err}")
|
|
|
|
| |
| |
| |
|
|
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Create the shared aiohttp session on startup; close it on shutdown."""
    # ThreadedResolver does DNS in threads, avoiding an aiodns dependency.
    connector = aiohttp.TCPConnector(
        resolver=aiohttp.resolver.ThreadedResolver(),
        limit=100,            # total pooled connections
        limit_per_host=15,
        ttl_dns_cache=300,    # seconds
        keepalive_timeout=60,
        enable_cleanup_closed=True,
    )
    # One session reused by every request handler via app.state.http.
    app.state.http = aiohttp.ClientSession(
        connector=connector,
        headers=_BE_HEADERS,
    )
    log.info("══════════════════════════════════════════")
    log.info(" ⚡ DevsDo API Server v1.0.0")
    log.info(f" Models : {len(Registry.all_cards())}")
    log.info(f" Backend: {_BACKEND}")
    log.info(f" Port : 7860")
    log.info("══════════════════════════════════════════")
    yield
    # Shutdown: release the connection pool.
    await app.state.http.close()
    log.info("Server stopped ✓")
|
|
|
|
# FastAPI application; the lifespan handler wires up the aiohttp session.
app = FastAPI(
    title="⚡ DevsDo API",
    description="OpenAI-compatible · 52 Models · Streaming · Reasoning",
    version="1.0.0",
    docs_url="/docs",
    redoc_url="/redoc",
    lifespan=lifespan,
)

# Wide-open CORS: any origin, method, and header.
# NOTE(review): allow_origins=["*"] combined with allow_credentials=True is
# a permissive configuration — confirm credentialed cross-origin requests
# are actually intended.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
|
|
|
|
| |
| |
| |
|
|
class Message(BaseModel):
    """One chat turn in the OpenAI wire format."""

    role: str     # e.g. "system" / "user" / "assistant" — not validated here
    content: str  # plain-text message body
|
|
class ChatRequest(BaseModel):
    """Request body for POST /v1/chat/completions (OpenAI-compatible subset)."""

    model: str = "kimi-k2.5"  # alias, backend id, or fuzzy name (see Registry.resolve)
    messages: list[Message] = Field(..., min_length=1)  # at least one turn
    stream: bool = False  # True → SSE chunks, False → single JSON body
    temperature: float = Field(default=0.7, ge=0.0, le=2.0)
    max_tokens: Optional[int] = Field(default=4096, ge=1)
|
|
|
|
| |
| |
| |
|
|
| def _cid() -> str: |
| """Generate a chat-completion ID.""" |
| return f"chatcmpl-{uuid.uuid4().hex[:29]}" |
|
|
| def _sse(obj: Any) -> str: |
| """Format one SSE frame.""" |
| return f"data: {json.dumps(obj, ensure_ascii=False)}\n\n" |
|
|
|
|
| |
|
|
@app.get("/")
async def root():
    """Service banner plus a quick directory of available endpoints."""
    endpoints = {
        "health": "GET /health",
        "models_openai": "GET /v1/models",
        "models_detail": "GET /api/internal/v1/models",
        "chat": "POST /v1/chat/completions",
    }
    return {
        "service": "⚡ DevsDo API",
        "version": "1.0.0",
        "status": "running",
        "models": len(Registry.all_cards()),
        "docs": "/docs",
        "endpoints": endpoints,
    }
|
|
|
|
@app.get("/health")
async def health():
    """Liveness probe: static status plus registry size and backend origin."""
    payload = {
        "status": "healthy",
        "timestamp": int(time.time()),
        "models": len(Registry.all_cards()),
        "backend": _BACKEND,
    }
    return payload
|
|
|
|
| |
|
|
@app.get("/v1/models")
async def models_openai():
    """OpenAI-compatible model listing (delegates to the registry)."""
    listing = Registry.openai_list()
    return listing
|
|
|
|
@app.get("/api/internal/v1/models")
async def models_internal():
    """Detailed model registry, grouped by family (internal extension)."""
    grouped = Registry.internal_list()
    return grouped
|
|
|
|
| |
|
|
@app.post("/v1/chat/completions")
async def chat_completions(req: ChatRequest):
    """
    OpenAI-compatible chat completions.

    • stream=false → JSON body; reasoning text (if any) under `message.reasoning`
    • stream=true  → SSE chunks; reasoning deltas carry a `reasoning` key
    """
    model_id = Registry.resolve(req.model)       # backend id for upstream
    card = Registry.find(req.model)
    display = card.name if card else req.model   # echoed back as "model"

    # Pydantic models → plain dicts for the upstream JSON body.
    msgs = [{"role": m.role, "content": m.content} for m in req.messages]

    if req.stream:
        return StreamingResponse(
            _stream_gen(app.state.http, msgs, model_id, display,
                        req.temperature, req.max_tokens or 4096),
            media_type="text/event-stream",
            headers={
                "Cache-Control": "no-cache",
                "Connection": "keep-alive",
                "X-Accel-Buffering": "no",  # disable proxy buffering for SSE
            },
        )

    return await _complete(
        app.state.http, msgs, model_id, display,
        req.temperature, req.max_tokens or 4096,
    )
|
|
|
|
| |
| |
| |
| |
| |
| |
| |
| |
|
|
async def _stream_gen(
    session: aiohttp.ClientSession,
    messages: list[dict],
    model_id: str,
    model_name: str,
    temperature: float,
    max_tokens: int,
) -> AsyncGenerator[str, None]:
    """Yield OpenAI ``chat.completion.chunk`` SSE frames from the backend.

    Emits a role-priming chunk, then content/reasoning deltas (reasoning
    text split out by ThinkParser under a `reasoning` key), then a finish
    chunk and the `[DONE]` sentinel.
    """
    cid = _cid()
    ts = int(time.time())
    parser = ThinkParser()

    def _chunk(delta: dict, finish: Optional[str] = None) -> str:
        # One SSE frame in OpenAI chunk format; same id/timestamp throughout.
        return _sse({
            "id": cid,
            "object": "chat.completion.chunk",
            "created": ts,
            "model": model_name,
            "choices": [{
                "index": 0,
                "delta": delta,
                "finish_reason": finish,
            }],
        })

    # OpenAI streams begin with a role-only delta.
    yield _chunk({"role": "assistant"})

    try:
        async for token in backend_stream(
            session, messages, model_id, temperature, max_tokens,
        ):
            for kind, text in parser.feed(token):
                if kind == "reasoning":
                    yield _chunk({"reasoning": text})
                else:
                    yield _chunk({"content": text})

        # Drain any partial tag the parser is still holding at end-of-stream.
        for kind, text in parser.flush():
            if kind == "reasoning":
                yield _chunk({"reasoning": text})
            else:
                yield _chunk({"content": text})

        yield _chunk({}, finish="stop")
        yield "data: [DONE]\n\n"

    except Exception as exc:
        # The HTTP status is already sent, so surface the failure in-band.
        # NOTE(review): finish_reason "error" is not a standard OpenAI value —
        # confirm downstream clients tolerate it.
        log.error(f"Stream error [{model_name}]: {exc}")
        yield _chunk({"content": f"\n\n[Error: {exc}]"}, finish="error")
        yield "data: [DONE]\n\n"
|
|
|
|
| |
| |
| |
|
|
async def _complete(
    session: aiohttp.ClientSession,
    messages: list[dict],
    model_id: str,
    model_name: str,
    temperature: float,
    max_tokens: int,
) -> dict:
    """Collect the full upstream response and return an OpenAI-style body.

    Reasoning (`<think>…</think>`) text is separated from visible output:
    content goes in `message.content`; reasoning, if any, in
    `message.reasoning`.

    Raises:
        HTTPException: 502 when the backend fails for any reason; the
            original exception is chained as the cause.
    """
    parser = ThinkParser()
    reasoning: list[str] = []
    content: list[str] = []

    try:
        async for token in backend_stream(
            session, messages, model_id, temperature, max_tokens,
        ):
            for kind, text in parser.feed(token):
                (reasoning if kind == "reasoning" else content).append(text)

        # Drain anything the parser is still holding (e.g. a partial tag).
        for kind, text in parser.flush():
            (reasoning if kind == "reasoning" else content).append(text)

    except Exception as exc:
        # Chain the cause so the 502's traceback shows the real failure.
        raise HTTPException(status_code=502, detail=f"Backend error: {exc}") from exc

    msg: dict = {
        "role": "assistant",
        "content": "".join(content),
    }
    if reasoning:
        msg["reasoning"] = "".join(reasoning)

    # Upstream sends no usage data — approximate with the ~4 chars/token rule.
    total_chars = len(msg["content"]) + len(msg.get("reasoning", ""))

    return {
        "id": _cid(),
        "object": "chat.completion",
        "created": int(time.time()),
        "model": model_name,
        "choices": [{
            "index": 0,
            "message": msg,
            "finish_reason": "stop",
        }],
        "usage": {
            "prompt_tokens": 0,
            "completion_tokens": total_chars // 4,
            "total_tokens": total_chars // 4,
        },
    }
|
|
|
|
| |
| |
| |
|
|
if __name__ == "__main__":
    import uvicorn
    # NOTE(review): the "app:app" import string assumes this file is named
    # app.py — confirm against the actual filename/deploy config.
    uvicorn.run(
        "app:app",
        host="0.0.0.0",
        port=7860,
        workers=1,
        timeout_keep_alive=120,
        log_level="info",
    )