Adarshu07 commited on
Commit
209e6a4
·
verified ·
1 Parent(s): d07254b

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +718 -0
app.py ADDED
@@ -0,0 +1,718 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ══════════════════════════════════════════════════════════════════
3
+ ⚡ DevsDo API Server v1.0.0
4
+
5
+ OpenAI-compatible · 52 Models · Cloudflare AI Backend
6
+ SSE Streaming · <think> Reasoning · Zero API Keys
7
+
8
+ Sections
9
+ ────────
10
+ §1 Logging
11
+ §2 Model Registry (g4f-style)
12
+ §3 Register All 52 Models
13
+ §4 Think-Tag Stream Parser
14
+ §5 Backend Client (SSE → raw tokens)
15
+ §6 FastAPI App + Lifespan
16
+ §7 Pydantic Schemas
17
+ §8 Routes
18
+ §9 Stream Generator (tokens → OpenAI SSE)
19
+ §10 Non-Stream Collector
20
+ §11 Entrypoint
21
+ ══════════════════════════════════════════════════════════════════
22
+ """
23
+
24
+ from __future__ import annotations
25
+
26
+ import json, time, uuid, asyncio, random, logging
27
+ from contextlib import asynccontextmanager
28
+ from dataclasses import dataclass, asdict
29
+ from typing import Optional, AsyncGenerator, Dict, List, Any
30
+
31
+ import aiohttp
32
+ import aiohttp.resolver
33
+ from fastapi import FastAPI, HTTPException
34
+ from fastapi.responses import StreamingResponse
35
+ from fastapi.middleware.cors import CORSMiddleware
36
+ from pydantic import BaseModel, Field
37
+
38
+
39
+ # ═══════════════════════════════════════════════════════════
40
+ # §1 — LOGGING
41
+ # ═══════════════════════════════════════════════════════════
42
+
43
# Root logging config: timestamped, column-aligned level names.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s │ %(levelname)-7s │ %(message)s",
    datefmt="%H:%M:%S",
)
# Module-wide logger used by every section below.
log = logging.getLogger("devsdo")
49
+
50
+
51
+ # ═══════════════════════════════════════════════════════════
52
+ # §2 — MODEL REGISTRY (g4f-style)
53
+ #
54
+ # Each model carries:
55
+ # name – short route alias ("deepseek-r1")
56
+ # real_name – human display name ("DeepSeek R1 Distill Qwen 32B")
57
+ # author – organisation ("DeepSeek")
58
+ # family – model family group ("DeepSeek")
59
+ # model_id – backend @cf/@hf ID ("@cf/deepseek-ai/…")
60
+ # ═══════════════════════════════════════════════════════════
61
+
62
@dataclass(frozen=True, slots=True)
class ModelCard:
    """Immutable descriptor for one routable chat model."""

    name: str       # short route alias, e.g. "deepseek-r1"
    real_name: str  # human-readable display name
    author: str     # publishing organisation
    family: str     # grouping key used by /api/internal/v1/models
    model_id: str   # backend @cf/… or @hf/… identifier
69
+
70
+
71
class Registry:
    """Central model store — register once, resolve anywhere."""

    _by_name: Dict[str, ModelCard] = {}
    _by_id: Dict[str, ModelCard] = {}
    _default: str = ""

    # ── mutators ──────────────────────────────────────
    @classmethod
    def add(cls, *cards: ModelCard):
        """Index every card by alias and by backend id; first-ever card becomes the default."""
        for card in cards:
            cls._by_name[card.name] = card
            cls._by_id[card.model_id] = card
        if cards and not cls._default:
            cls._default = cards[0].name

    # ── lookups ───────────────────────────────────────
    @classmethod
    def resolve(cls, raw: Optional[str]) -> str:
        """Alias / full-id / fuzzy → backend model_id.

        Resolution order: default (empty input) → strip known route
        prefixes → literal @cf/@hf id → exact alias → substring match
        against aliases and backend ids → pass the input through as-is.
        """
        if not raw:
            return cls._by_name[cls._default].model_id

        query = raw.strip()
        for prefix in ("devsdo/", "devsdo:", "cloudflare/", "cf/"):
            if query.lower().startswith(prefix):
                query = query[len(prefix):]
                break

        if query.startswith(("@cf/", "@hf/")):
            return query

        exact = cls._by_name.get(query)
        if exact is not None:
            return exact.model_id

        needle = query.lower()
        for alias, candidate in cls._by_name.items():
            if needle in alias or needle in candidate.model_id.lower():
                return candidate.model_id

        return query  # pass-through

    @classmethod
    def find(cls, raw: str) -> Optional[ModelCard]:
        """Best-effort card lookup: resolved backend id first, raw alias second."""
        resolved = cls.resolve(raw)
        return cls._by_id.get(resolved) or cls._by_name.get(raw)

    @classmethod
    def all_cards(cls) -> List[ModelCard]:
        """Every registered card, in registration order."""
        return list(cls._by_name.values())

    # ── serialisers ───────────────────────────────────
    @classmethod
    def openai_list(cls) -> dict:
        """GET /v1/models — OpenAI-compatible."""
        data = [
            {
                "id": card.name,
                "object": "model",
                "created": 1700000000,
                "owned_by": card.author.lower().replace(" ", "-"),
            }
            for card in cls._by_name.values()
        ]
        return {"object": "list", "data": data}

    @classmethod
    def internal_list(cls) -> dict:
        """GET /api/internal/v1/models — rich, grouped by family."""
        grouped: Dict[str, list] = {}
        for card in cls._by_name.values():
            entry = {
                "id": card.name,
                "name": card.real_name,
                "author": card.author,
                "backend_id": card.model_id,
            }
            grouped.setdefault(card.family, []).append(entry)

        families = [
            {"family": fname, "count": len(members), "models": members}
            for fname, members in grouped.items()
        ]
        return {
            "server": "DevsDo API",
            "version": "1.0.0",
            "timestamp": int(time.time()),
            "total": len(cls._by_name),
            "families": families,
        }
157
+
158
+
159
+ # ═══════════════════════════════════════════════════════════
160
+ # §3 — REGISTER ALL 52 MODELS
161
+ # ═══════════════════════════════════════════════════════════
162
+
163
# One-time registration of all 52 models.
# Order matters: the first card ("kimi-k2.5") becomes the default model.
Registry.add(
    # ─── Flagship / Large ─────────────────────────────────
    ModelCard("kimi-k2.5", "Kimi K2.5", "Moonshot AI", "Kimi", "@cf/moonshotai/kimi-k2.5"),
    ModelCard("nemotron-120b", "Nemotron 3 120B A12B", "NVIDIA", "Nemotron", "@cf/nvidia/nemotron-3-120b-a12b"),
    ModelCard("gpt-oss-120b", "GPT-OSS 120B", "OpenAI", "GPT-OSS", "@cf/openai/gpt-oss-120b"),
    ModelCard("gpt-oss-20b", "GPT-OSS 20B", "OpenAI", "GPT-OSS", "@cf/openai/gpt-oss-20b"),
    ModelCard("llama-3.3-70b", "LLaMA 3.3 70B Instruct FP8", "Meta", "LLaMA", "@cf/meta/llama-3.3-70b-instruct-fp8-fast"),

    # ─── Meta LLaMA ───────────────────────────────────────
    ModelCard("llama-4-scout", "LLaMA 4 Scout 17B 16E", "Meta", "LLaMA", "@cf/meta/llama-4-scout-17b-16e-instruct"),
    ModelCard("llama-3.2-11b-vision", "LLaMA 3.2 11B Vision", "Meta", "LLaMA", "@cf/meta/llama-3.2-11b-vision-instruct"),
    ModelCard("llama-3.1-8b", "LLaMA 3.1 8B Fast", "Meta", "LLaMA", "@cf/meta/llama-3.1-8b-instruct-fast"),
    ModelCard("llama-3.1-8b-fp8", "LLaMA 3.1 8B FP8", "Meta", "LLaMA", "@cf/meta/llama-3.1-8b-instruct-fp8"),
    ModelCard("llama-3.1-8b-awq", "LLaMA 3.1 8B AWQ", "Meta", "LLaMA", "@cf/meta/llama-3.1-8b-instruct-awq"),
    ModelCard("llama-3.2-3b", "LLaMA 3.2 3B", "Meta", "LLaMA", "@cf/meta/llama-3.2-3b-instruct"),
    ModelCard("llama-3.2-1b", "LLaMA 3.2 1B", "Meta", "LLaMA", "@cf/meta/llama-3.2-1b-instruct"),
    ModelCard("llama-3-8b", "LLaMA 3 8B", "Meta", "LLaMA", "@cf/meta/llama-3-8b-instruct"),
    ModelCard("llama-3-8b-awq", "LLaMA 3 8B AWQ", "Meta", "LLaMA", "@cf/meta/llama-3-8b-instruct-awq"),
    ModelCard("llama-guard-3", "LLaMA Guard 3 8B", "Meta", "LLaMA", "@cf/meta/llama-guard-3-8b"),
    ModelCard("llama-2-7b-fp16", "LLaMA 2 7B FP16", "Meta", "LLaMA", "@cf/meta/llama-2-7b-chat-fp16"),
    ModelCard("llama-2-7b-int8", "LLaMA 2 7B INT8", "Meta", "LLaMA", "@cf/meta/llama-2-7b-chat-int8"),
    ModelCard("llama-2-7b-lora", "LLaMA 2 7B LoRA", "Meta", "LLaMA", "@cf/meta-llama/llama-2-7b-chat-hf-lora"),
    ModelCard("llama-2-13b", "LLaMA 2 13B AWQ", "Meta", "LLaMA", "@hf/thebloke/llama-2-13b-chat-awq"),

    # ─── Qwen ─────────────────────────────────────────────
    ModelCard("qwq-32b", "QwQ 32B", "Qwen", "Qwen", "@cf/qwen/qwq-32b"),
    ModelCard("qwen-coder-32b", "Qwen 2.5 Coder 32B", "Qwen", "Qwen", "@cf/qwen/qwen2.5-coder-32b-instruct"),
    ModelCard("qwen3-30b", "Qwen 3 30B A3B FP8", "Qwen", "Qwen", "@cf/qwen/qwen3-30b-a3b-fp8"),
    ModelCard("qwen1.5-14b", "Qwen 1.5 14B AWQ", "Qwen", "Qwen", "@cf/qwen/qwen1.5-14b-chat-awq"),
    ModelCard("qwen1.5-7b", "Qwen 1.5 7B AWQ", "Qwen", "Qwen", "@cf/qwen/qwen1.5-7b-chat-awq"),
    ModelCard("qwen1.5-1.8b", "Qwen 1.5 1.8B", "Qwen", "Qwen", "@cf/qwen/qwen1.5-1.8b-chat"),
    ModelCard("qwen1.5-0.5b", "Qwen 1.5 0.5B", "Qwen", "Qwen", "@cf/qwen/qwen1.5-0.5b-chat"),

    # ─── DeepSeek ─────────────────────────────────────────
    ModelCard("deepseek-r1", "DeepSeek R1 Distill Qwen 32B", "DeepSeek", "DeepSeek", "@cf/deepseek-ai/deepseek-r1-distill-qwen-32b"),
    ModelCard("deepseek-math", "DeepSeek Math 7B", "DeepSeek", "DeepSeek", "@cf/deepseek-ai/deepseek-math-7b-instruct"),
    ModelCard("deepseek-coder-base", "DeepSeek Coder 6.7B Base", "DeepSeek", "DeepSeek", "@hf/thebloke/deepseek-coder-6.7b-base-awq"),
    ModelCard("deepseek-coder", "DeepSeek Coder 6.7B Instruct", "DeepSeek", "DeepSeek", "@hf/thebloke/deepseek-coder-6.7b-instruct-awq"),

    # ─── Google Gemma ─────────────────────────────────────
    ModelCard("gemma-3-12b", "Gemma 3 12B IT", "Google", "Gemma", "@cf/google/gemma-3-12b-it"),
    ModelCard("gemma-7b", "Gemma 7B IT", "Google", "Gemma", "@hf/google/gemma-7b-it"),
    ModelCard("gemma-2b-lora", "Gemma 2B IT LoRA", "Google", "Gemma", "@cf/google/gemma-2b-it-lora"),
    ModelCard("gemma-7b-lora", "Gemma 7B IT LoRA", "Google", "Gemma", "@cf/google/gemma-7b-it-lora"),

    # ─── Mistral ──────────────────────────────────────────
    ModelCard("mistral-small-3.1", "Mistral Small 3.1 24B", "Mistral AI", "Mistral", "@cf/mistralai/mistral-small-3.1-24b-instruct"),
    ModelCard("mistral-v0.2", "Mistral 7B v0.2", "Mistral AI", "Mistral", "@hf/mistral/mistral-7b-instruct-v0.2"),
    ModelCard("mistral-v0.2-lora", "Mistral 7B v0.2 LoRA", "Mistral AI", "Mistral", "@cf/mistral/mistral-7b-instruct-v0.2-lora"),
    ModelCard("mistral-v0.1", "Mistral 7B v0.1", "Mistral AI", "Mistral", "@cf/mistral/mistral-7b-instruct-v0.1"),
    ModelCard("mistral-v0.1-awq", "Mistral 7B v0.1 AWQ", "Mistral AI", "Mistral", "@hf/thebloke/mistral-7b-instruct-v0.1-awq"),

    # ─── IBM Granite ──────────────────────────────────────
    ModelCard("granite-4.0", "Granite 4.0 H Micro", "IBM", "Granite", "@cf/ibm-granite/granite-4.0-h-micro"),

    # ─── ZhipuAI GLM ─────────────────────────────────────
    ModelCard("glm-4.7-flash", "GLM 4.7 Flash", "ZhipuAI", "GLM", "@cf/zai-org/glm-4.7-flash"),

    # ─── AI Singapore ─────────────────────────────────────
    ModelCard("sea-lion-27b", "SEA-LION v4 27B", "AI Singapore", "SEA-LION", "@cf/aisingapore/gemma-sea-lion-v4-27b-it"),

    # ─── Community / Other ────────────────────────────────
    ModelCard("hermes-2-pro", "Hermes 2 Pro Mistral 7B", "NousResearch", "Hermes", "@hf/nousresearch/hermes-2-pro-mistral-7b"),
    ModelCard("openhermes-2.5", "OpenHermes 2.5 Mistral 7B", "NousResearch", "Hermes", "@hf/thebloke/openhermes-2.5-mistral-7b-awq"),
    ModelCard("starling-7b", "Starling LM 7B Beta", "Nexusflow", "Starling", "@hf/nexusflow/starling-lm-7b-beta"),
    ModelCard("neural-chat-7b", "Neural Chat 7B v3.1", "Intel", "Neural Chat", "@hf/thebloke/neural-chat-7b-v3-1-awq"),
    ModelCard("openchat-3.5", "OpenChat 3.5", "OpenChat", "OpenChat", "@cf/openchat/openchat-3.5-0106"),
    ModelCard("cybertron-7b", "UNA Cybertron 7B v2", "fblgit", "Cybertron", "@cf/fblgit/una-cybertron-7b-v2-bf16"),
    ModelCard("discolm-german-7b", "DiscoLM German 7B", "TheBloke", "DiscoLM", "@cf/thebloke/discolm-german-7b-v1-awq"),
    ModelCard("zephyr-7b", "Zephyr 7B Beta", "HuggingFace", "Zephyr", "@hf/thebloke/zephyr-7b-beta-awq"),
    ModelCard("falcon-7b", "Falcon 7B Instruct", "TII UAE", "Falcon", "@cf/tiiuae/falcon-7b-instruct"),
    ModelCard("tinyllama-1.1b", "TinyLlama 1.1B Chat", "TinyLlama", "TinyLlama", "@cf/tinyllama/tinyllama-1.1b-chat-v1.0"),
    ModelCard("phi-2", "Phi 2", "Microsoft", "Phi", "@cf/microsoft/phi-2"),
    ModelCard("sqlcoder", "SQLCoder 7B 2", "Defog", "SQLCoder", "@cf/defog/sqlcoder-7b-2"),
)
238
+
239
+
240
+ # ═══════════════════════════════════════════════════════════
241
+ # §4 — THINK-TAG STREAM PARSER
242
+ #
243
+ # Detects <think>…</think> across chunked tokens.
244
+ # Yields ("reasoning", text) or ("content", text).
245
+ # Handles tags split across multiple SSE tokens.
246
+ # ═══════════════════════════════════════════════════════════
247
+
248
class ThinkParser:
    """Incremental classifier for <think>…</think> spans in a token stream.

    feed() returns ("reasoning", text) / ("content", text) fragments and
    correctly handles tags that arrive split across several tokens by
    holding back any trailing partial tag until the next token.
    """

    __slots__ = ("thinking", "buf")

    OPEN = "<think>"    # 7 chars
    CLOSE = "</think>"  # 8 chars

    def __init__(self):
        # True while inside an unclosed <think> span.
        self.thinking = False
        # Text held back because it may end in a partial tag.
        self.buf = ""

    # ── feed one token, get classified fragments ──────
    def feed(self, token: str) -> list[tuple[str, str]]:
        """Classify one incoming token into reasoning/content fragments."""
        self.buf += token
        fragments: list[tuple[str, str]] = []

        while self.buf:
            if self.thinking:
                marker, label = self.CLOSE, "reasoning"
            else:
                marker, label = self.OPEN, "content"

            pos = self.buf.find(marker)
            if pos != -1:
                # Complete tag present: emit preceding text, toggle mode.
                if pos:
                    fragments.append((label, self.buf[:pos]))
                self.buf = self.buf[pos + len(marker):]
                self.thinking = not self.thinking
                continue

            # No full tag; a prefix of one may be dangling at the end.
            dangling = self._partial(marker)
            if dangling:
                emit = self.buf[: len(self.buf) - len(dangling)]
                if emit:
                    fragments.append((label, emit))
                self.buf = dangling
            else:
                fragments.append((label, self.buf))
                self.buf = ""
            break

        return fragments

    # ── drain remaining buffer at stream end ──────────
    def flush(self) -> list[tuple[str, str]]:
        """Emit whatever is still buffered once the stream has ended."""
        if not self.buf:
            return []
        label = "reasoning" if self.thinking else "content"
        leftover = [(label, self.buf)]
        self.buf = ""
        return leftover

    # ── helper ────────────────────────────────────────
    def _partial(self, tag: str) -> str:
        """Longest suffix of the buffer that is a proper prefix of *tag*."""
        limit = min(len(tag) - 1, len(self.buf))
        for size in range(limit, 0, -1):
            if tag.startswith(self.buf[-size:]):
                return self.buf[-size:]
        return ""
305
+
306
+
307
+ # ═══════════════════════════════════════════════════════════
308
+ # §5 — BACKEND CLIENT
309
+ #
310
+ # Talks to the Cloudflare AI proxy hosted on HF Spaces.
311
+ # Parses upstream SSE and yields raw string tokens.
312
+ # Retries on transient HTTP errors.
313
+ # ═══════════════════════════════════════════════════════════
314
+
315
# Upstream Cloudflare-AI proxy (hosted on HF Spaces).
_BACKEND = "https://adarshu07-ls.hf.space"
_BACKEND_URL = f"{_BACKEND}/v1/chat/completions"

# Transient statuses worth retrying (incl. Cloudflare 52x edge errors).
_RETRYABLE = frozenset({429, 500, 502, 503, 504, 520, 521, 522, 523, 524})
# Client-side errors: retrying cannot help, fail immediately.
_FATAL = frozenset({400, 401, 403, 404, 405, 422})

# Browser-like headers sent with every backend request.
_BE_HEADERS = {
    "Accept": "application/json",
    "Accept-Encoding": "gzip, deflate, br",
    "Content-Type": "application/json",
    "Origin": _BACKEND,
    "Referer": f"{_BACKEND}/docs",
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/131.0.0.0 Safari/537.36"
    ),
}
333
+
334
+
335
+ def _parse_sse(line: str) -> tuple[str, bool]:
336
+ """One SSE data: line → (token_text, is_done)."""
337
+ line = line.strip()
338
+ if not line.startswith("data:"):
339
+ return "", False
340
+ payload = line[5:].strip()
341
+ if payload == "[DONE]":
342
+ return "", True
343
+ try:
344
+ obj = json.loads(payload)
345
+ if "error" in obj:
346
+ return "", True
347
+ delta = obj.get("choices", [{}])[0].get("delta", {})
348
+ return delta.get("content", "") or "", False
349
+ except (json.JSONDecodeError, KeyError, IndexError):
350
+ return "", False
351
+
352
+
353
async def backend_stream(
    session: aiohttp.ClientSession,
    messages: list[dict],
    model_id: str,
    temperature: float = 0.7,
    max_tokens: int = 4096,
    timeout: int = 180,
    retries: int = 2,
) -> AsyncGenerator[str, None]:
    """POST → upstream, parse SSE, yield raw tokens.

    Args:
        session: shared aiohttp session (created in lifespan).
        messages: OpenAI-style [{"role": …, "content": …}] list.
        model_id: resolved backend @cf/@hf model identifier.
        temperature: sampling temperature, forwarded verbatim.
        max_tokens: completion budget; a falsy value omits the field.
        timeout: total request / per-read timeout in seconds.
        retries: extra attempts after the first try.

    Raises:
        RuntimeError: fatal HTTP status, exhausted retries, or an
            unreachable backend.
    """

    body: dict = {
        "model": model_id,
        "messages": messages,
        "stream": True,
        "temperature": temperature,
    }
    if max_tokens:
        body["max_tokens"] = max_tokens

    last_err = ""

    for attempt in range(1 + retries):
        try:
            async with session.post(
                _BACKEND_URL,
                json=body,
                timeout=aiohttp.ClientTimeout(
                    total=timeout,
                    sock_connect=30,
                    sock_read=timeout,
                ),
            ) as resp:

                if resp.status == 200:
                    # Read upstream SSE line-by-line and yield only the
                    # token text; stop on [DONE] / upstream error / EOF.
                    while True:
                        raw = await resp.content.readline()
                        if not raw:
                            break  # connection closed by upstream
                        line = raw.decode("utf-8", errors="replace")
                        if not line.strip():
                            continue  # SSE frame separator
                        tok, done = _parse_sse(line)
                        if done:
                            return
                        if tok:
                            yield tok
                    return

                text = await resp.text()
                last_err = f"HTTP {resp.status}: {text[:300]}"

                if resp.status in _FATAL:
                    raise RuntimeError(last_err)
                if resp.status in _RETRYABLE and attempt < retries:
                    # Linear-ish backoff with jitter, capped at 15 s.
                    wait = min(2.0 * (attempt + 1) + random.random(), 15)
                    log.warning(f"Retry {attempt+1}/{retries} in {wait:.1f}s — {last_err}")
                    await asyncio.sleep(wait)
                    continue
                raise RuntimeError(last_err)

        except (RuntimeError, GeneratorExit):
            # Our own failures / consumer cancellation: never retried.
            raise
        except (aiohttp.ClientError, asyncio.TimeoutError, OSError) as exc:
            # Transport-level failure — retry with a simple linear delay.
            last_err = str(exc)
            if attempt < retries:
                log.warning(f"Retry {attempt+1}/{retries} — {last_err}")
                await asyncio.sleep(1.5 * (attempt + 1))
                continue
            raise RuntimeError(f"Backend unreachable: {last_err}") from exc

    raise RuntimeError(f"All retries exhausted: {last_err}")
425
+
426
+
427
+ # ═══════════════════════════════════════════════════════════
428
+ # §6 — FASTAPI APP + LIFESPAN
429
+ # ═══════════════════════════════════════════════════════════
430
+
431
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Create the shared aiohttp session on startup; close it on shutdown."""
    # ── startup ───────────────────────────────────────
    connector = aiohttp.TCPConnector(
        resolver=aiohttp.resolver.ThreadedResolver(),
        limit=100,            # total pooled connections
        limit_per_host=15,
        ttl_dns_cache=300,    # seconds
        keepalive_timeout=60,
        enable_cleanup_closed=True,
    )
    # Single shared session for all requests; handlers read app.state.http.
    app.state.http = aiohttp.ClientSession(
        connector=connector,
        headers=_BE_HEADERS,
    )
    log.info("══════════════════════════════════════════")
    log.info(" ⚡ DevsDo API Server v1.0.0")
    log.info(f" Models : {len(Registry.all_cards())}")
    log.info(f" Backend: {_BACKEND}")
    log.info(f" Port : 7860")
    log.info("══════════════════════════════════════════")
    yield
    # ── shutdown ──────────────────────────────────────
    await app.state.http.close()
    log.info("Server stopped ✓")
456
+
457
+
458
# Public FastAPI application — OpenAI-compatible surface.
app = FastAPI(
    title="⚡ DevsDo API",
    description="OpenAI-compatible · 52 Models · Streaming · Reasoning",
    version="1.0.0",
    docs_url="/docs",
    redoc_url="/redoc",
    lifespan=lifespan,
)

# Wide-open CORS for browser clients.
# NOTE(review): allow_origins=["*"] combined with allow_credentials=True is
# rejected by browsers per the CORS spec (credentialed requests may not use
# a wildcard origin) — confirm whether credentials are actually needed here.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
474
+
475
+
476
+ # ═══════════════════════════════════════════════════════════
477
+ # §7 — PYDANTIC SCHEMAS
478
+ # ═══════════════════════════════════════════════════════════
479
+
480
class Message(BaseModel):
    """One chat turn — free-form role and content strings."""

    role: str
    content: str

class ChatRequest(BaseModel):
    """Body of POST /v1/chat/completions (OpenAI-compatible subset)."""

    model: str = "kimi-k2.5"  # alias, full @cf/@hf id, or fuzzy name
    messages: list[Message] = Field(..., min_length=1)
    stream: bool = False
    temperature: float = Field(default=0.7, ge=0.0, le=2.0)
    max_tokens: Optional[int] = Field(default=4096, ge=1)
490
+
491
+
492
+ # ═══════════════════════════════════════════════════════════
493
+ # §8 — ROUTES
494
+ # ═══════════════════════════════════════════════════════════
495
+
496
+ def _cid() -> str:
497
+ """Generate a chat-completion ID."""
498
+ return f"chatcmpl-{uuid.uuid4().hex[:29]}"
499
+
500
+ def _sse(obj: Any) -> str:
501
+ """Format one SSE frame."""
502
+ return f"data: {json.dumps(obj, ensure_ascii=False)}\n\n"
503
+
504
+
505
+ # ── info ──────────────────────────────────────────────────
506
+
507
+ @app.get("/")
508
+ async def root():
509
+ return {
510
+ "service": "⚡ DevsDo API",
511
+ "version": "1.0.0",
512
+ "status": "running",
513
+ "models": len(Registry.all_cards()),
514
+ "docs": "/docs",
515
+ "endpoints": {
516
+ "health": "GET /health",
517
+ "models_openai": "GET /v1/models",
518
+ "models_detail": "GET /api/internal/v1/models",
519
+ "chat": "POST /v1/chat/completions",
520
+ },
521
+ }
522
+
523
+
524
+ @app.get("/health")
525
+ async def health():
526
+ return {
527
+ "status": "healthy",
528
+ "timestamp": int(time.time()),
529
+ "models": len(Registry.all_cards()),
530
+ "backend": _BACKEND,
531
+ }
532
+
533
+
534
+ # ── models ────────────────────────────────────────────────
535
+
536
+ @app.get("/v1/models")
537
+ async def models_openai():
538
+ """OpenAI-compatible model list."""
539
+ return Registry.openai_list()
540
+
541
+
542
+ @app.get("/api/internal/v1/models")
543
+ async def models_internal():
544
+ """Rich model registry grouped by family."""
545
+ return Registry.internal_list()
546
+
547
+
548
+ # ── chat completions ─────────────────────────────────────
549
+
550
+ @app.post("/v1/chat/completions")
551
+ async def chat_completions(req: ChatRequest):
552
+ """
553
+ OpenAI-compatible chat completions.
554
+
555
+ • stream=false → JSON (reasoning in `reasoning_content`)
556
+ • stream=true → SSE (reasoning chunks use `reasoning_content` in delta)
557
+ """
558
+ model_id = Registry.resolve(req.model)
559
+ card = Registry.find(req.model)
560
+ display = card.name if card else req.model
561
+
562
+ msgs = [{"role": m.role, "content": m.content} for m in req.messages]
563
+
564
+ if req.stream:
565
+ return StreamingResponse(
566
+ _stream_gen(app.state.http, msgs, model_id, display,
567
+ req.temperature, req.max_tokens or 4096),
568
+ media_type="text/event-stream",
569
+ headers={
570
+ "Cache-Control": "no-cache",
571
+ "Connection": "keep-alive",
572
+ "X-Accel-Buffering": "no",
573
+ },
574
+ )
575
+
576
+ return await _complete(
577
+ app.state.http, msgs, model_id, display,
578
+ req.temperature, req.max_tokens or 4096,
579
+ )
580
+
581
+
582
+ # ═══════════════════════════════════════════════════════════
583
+ # §9 — SSE STREAM GENERATOR
584
+ #
585
+ # backend tokens → ThinkParser → OpenAI SSE chunks
586
+ #
587
+ # Reasoning tokens go into delta.reasoning_content
588
+ # Normal tokens go into delta.content
589
+ # ═══════════════════════════════════════════════════════════
590
+
591
async def _stream_gen(
    session: aiohttp.ClientSession,
    messages: list[dict],
    model_id: str,
    model_name: str,
    temperature: float,
    max_tokens: int,
) -> AsyncGenerator[str, None]:
    """Relay backend tokens as OpenAI ``chat.completion.chunk`` SSE frames.

    Reasoning fragments (inside <think>…</think>) are emitted in
    delta.reasoning_content; everything else in delta.content. Errors are
    surfaced in-band as a final content chunk so the client stream still
    terminates with [DONE].
    """

    cid = _cid()
    ts = int(time.time())
    parser = ThinkParser()

    def _chunk(delta: dict, finish: Optional[str] = None) -> str:
        # One OpenAI-shaped SSE frame sharing this stream's id/ts/model.
        return _sse({
            "id": cid,
            "object": "chat.completion.chunk",
            "created": ts,
            "model": model_name,
            "choices": [{
                "index": 0,
                "delta": delta,
                "finish_reason": finish,
            }],
        })

    # ── role announcement ─────────────────────────────
    yield _chunk({"role": "assistant"})

    try:
        async for token in backend_stream(
            session, messages, model_id, temperature, max_tokens,
        ):
            for kind, text in parser.feed(token):
                if kind == "reasoning":
                    yield _chunk({"reasoning_content": text})
                else:
                    yield _chunk({"content": text})

        # ── flush parser buffer ───────────────────────
        for kind, text in parser.flush():
            if kind == "reasoning":
                yield _chunk({"reasoning_content": text})
            else:
                yield _chunk({"content": text})

        # ── stop ──────────────────────────────────────
        yield _chunk({}, finish="stop")
        yield "data: [DONE]\n\n"

    except Exception as exc:
        # In-band error report, then a clean stream termination.
        log.error(f"Stream error [{model_name}]: {exc}")
        yield _chunk({"content": f"\n\n[Error: {exc}]"}, finish="error")
        yield "data: [DONE]\n\n"
645
+
646
+
647
+ # ═══════════════════════════════════════════════════════════
648
+ # §10 — NON-STREAMING COLLECTOR
649
+ # ═══════════════════════════════════════════════════════════
650
+
651
async def _complete(
    session: aiohttp.ClientSession,
    messages: list[dict],
    model_id: str,
    model_name: str,
    temperature: float,
    max_tokens: int,
) -> dict:
    """Collect the full backend stream into one OpenAI-style response.

    Reasoning (<think>…</think>) text is returned separately in
    message.reasoning_content; everything else in message.content.

    Raises:
        HTTPException: 502 for any backend failure, with the original
            exception chained for server-side tracebacks.
    """

    parser = ThinkParser()
    reasoning: list[str] = []
    content: list[str] = []

    try:
        async for token in backend_stream(
            session, messages, model_id, temperature, max_tokens,
        ):
            for kind, text in parser.feed(token):
                (reasoning if kind == "reasoning" else content).append(text)

        for kind, text in parser.flush():
            (reasoning if kind == "reasoning" else content).append(text)

    except Exception as exc:
        # Boundary handler: translate any backend failure into a 502.
        # `from exc` preserves the causal chain in logs/tracebacks.
        raise HTTPException(status_code=502, detail=f"Backend error: {exc}") from exc

    msg: dict = {
        "role": "assistant",
        "content": "".join(content),
    }
    if reasoning:
        msg["reasoning_content"] = "".join(reasoning)

    total_chars = len(msg["content"]) + len(msg.get("reasoning_content", ""))

    return {
        "id": _cid(),
        "object": "chat.completion",
        "created": int(time.time()),
        "model": model_name,
        "choices": [{
            "index": 0,
            "message": msg,
            "finish_reason": "stop",
        }],
        "usage": {
            "prompt_tokens": 0,
            # No tokenizer available locally — ~4 chars/token heuristic.
            "completion_tokens": total_chars // 4,
            "total_tokens": total_chars // 4,
        },
    }
703
+
704
+
705
+ # ═══════════════════════════════════════════════════════════
706
+ # §11 — ENTRYPOINT
707
+ # ═══════════════════════════════════════════════════════════
708
+
709
+ if __name__ == "__main__":
710
+ import uvicorn
711
+ uvicorn.run(
712
+ "app:app",
713
+ host="0.0.0.0",
714
+ port=7860,
715
+ workers=1,
716
+ timeout_keep_alive=120,
717
+ log_level="info",
718
+ )