MB-IDK committed on
Commit
a8ad738
·
verified ·
1 Parent(s): 4d0abc0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +1212 -0
app.py CHANGED
@@ -0,0 +1,1212 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Multi-Model AI API — HuggingFace Spaces Edition
4
+ Unified API gateway for multiple AI models via Hugging Face Spaces.
5
+ """
6
+
7
+ import re, os, json, uuid, time, random, string, logging, threading
8
+ from abc import ABC, abstractmethod
9
+ from collections import deque
10
+ from dataclasses import dataclass, field
11
+ from typing import Any, Dict, Generator, List, Optional, Tuple, Union
12
+
13
+ import requests
14
+ from flask import Flask, request as freq, jsonify, Response, stream_with_context
15
+
16
+ try:
17
+ from gradio_client import Client as GradioClient
18
+ HAS_GRADIO_CLIENT = True
19
+ except ImportError:
20
+ HAS_GRADIO_CLIENT = False
21
+
22
+ # ═══════════════════════════════════════════════════════════════
23
+ # CONFIG & CONSTANTS
24
+ # ═══════════════════════════════════════════════════════════════
25
+
26
# Release identifier and service name (the name is also used for the logger).
VERSION = "2.2.0-hf"
APP_NAME = "Multi-Model-AI-API"

# Fallbacks applied when a request names neither a system prompt nor a model.
DEFAULT_SYSTEM_PROMPT = "You are a helpful, friendly AI assistant."
DEFAULT_MODEL = "gpt-oss-120b"

logging.basicConfig(
    format="%(asctime)s %(levelname)s %(message)s",
    level=logging.INFO,
)
log = logging.getLogger(APP_NAME)

# Browser User-Agent strings; GptOssProvider._rotate picks one at random.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/144.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_5) AppleWebKit/605.1.15 Safari/605.1.15",
]
38
+
39
+ # ═══════════════════════════════════════════════════════════════
40
+ # MODEL REGISTRY
41
+ # ═══════════════════════════════════════════════════════════════
42
+
43
@dataclass
class ModelDef:
    """Static description of one upstream model and the features it supports."""

    # --- identity / routing -------------------------------------------------
    model_id: str        # key used in MODEL_REGISTRY and in requests
    display_name: str
    provider_type: str   # e.g. "gradio_sse" or "gradio_client"
    space_id: str        # Space URL or "owner/name" identifier
    owned_by: str
    description: str = ""

    # --- capability flags ---------------------------------------------------
    supports_system_prompt: bool = True
    supports_temperature: bool = True
    supports_streaming: bool = True
    supports_history: bool = True
    supports_vision: bool = False
    supports_thinking: bool = False
    thinking_default: bool = True

    # --- generation defaults ------------------------------------------------
    max_tokens_default: int = 4096
    default_temperature: float = 0.7

    # --- endpoint selection and per-model extras ----------------------------
    fn_index: Optional[int] = None
    api_name: Optional[str] = None
    extra_params: Dict[str, Any] = field(default_factory=dict)
    clean_analysis: bool = False  # run ResponseCleaner.clean_analysis on output
64
+
65
# Global lookup table: model id -> model definition.
MODEL_REGISTRY: Dict[str, "ModelDef"] = {}


def register_model(m: "ModelDef") -> None:
    """Store *m* in ``MODEL_REGISTRY`` under its ``model_id`` (replacing any previous entry)."""
    MODEL_REGISTRY.update({m.model_id: m})
69
+
70
def _init_registry():
    """Register the built-in model definitions, in a fixed order."""
    builtin_models = [
        ModelDef(
            model_id="gpt-oss-120b", display_name="AMD GPT-OSS-120B",
            provider_type="gradio_sse", space_id="https://amd-gpt-oss-120b-chatbot.hf.space",
            owned_by="amd", description="AMD open-source 120B model",
            fn_index=8, clean_analysis=True, default_temperature=0.0,
            supports_vision=False, supports_thinking=False,
        ),
        ModelDef(
            model_id="command-a-vision", display_name="Cohere Command-A Vision",
            provider_type="gradio_client", space_id="CohereLabs/command-a-vision",
            owned_by="cohere", description="Cohere multimodal command model",
            api_name="/chat", supports_vision=True, supports_system_prompt=False,
            supports_temperature=False, supports_streaming=False, supports_history=False,
            supports_thinking=False, max_tokens_default=700,
            extra_params={"max_new_tokens": 700},
        ),
        ModelDef(
            model_id="command-a-translate", display_name="Cohere Command-A Translate",
            provider_type="gradio_client", space_id="CohereLabs/command-a-translate",
            owned_by="cohere", description="Cohere translation model",
            api_name="/chat", supports_vision=False, supports_system_prompt=False,
            supports_temperature=False, supports_streaming=False, supports_history=False,
            supports_thinking=False, max_tokens_default=700,
            extra_params={"max_new_tokens": 700},
        ),
        ModelDef(
            model_id="minimax-vl-01", display_name="MiniMax VL-01",
            provider_type="gradio_client", space_id="MiniMaxAI/MiniMax-VL-01",
            owned_by="minimax", description="MiniMax vision-language model",
            api_name="/chat", supports_vision=True, supports_system_prompt=False,
            supports_temperature=True, supports_streaming=False, supports_history=False,
            supports_thinking=False, max_tokens_default=12800, default_temperature=0.1,
            extra_params={"max_tokens": 12800, "top_p": 0.9},
        ),
        ModelDef(
            model_id="glm-4.5", display_name="GLM-4.5 (ZhipuAI)",
            provider_type="gradio_client", space_id="zai-org/GLM-4.5-Space",
            owned_by="zhipuai", description="ZhipuAI GLM-4.5 with thinking mode",
            api_name="/chat_wrapper", supports_vision=False, supports_system_prompt=True,
            supports_temperature=True, supports_streaming=False, supports_history=False,
            supports_thinking=True, thinking_default=True, default_temperature=1.0,
            extra_params={"thinking_enabled": True},
        ),
        ModelDef(
            model_id="chatgpt", display_name="ChatGPT (Community)",
            provider_type="gradio_client", space_id="yuntian-deng/ChatGPT",
            owned_by="community", description="ChatGPT via community Space",
            api_name="/predict", supports_vision=False, supports_system_prompt=False,
            supports_temperature=True, supports_streaming=False, supports_history=True,
            supports_thinking=False, default_temperature=1.0,
            extra_params={"top_p": 1.0},
        ),
        ModelDef(
            model_id="qwen3-vl", display_name="Qwen3-VL (Alibaba)",
            provider_type="gradio_client", space_id="Qwen/Qwen3-VL-Demo",
            owned_by="alibaba", description="Alibaba Qwen3 Vision-Language model",
            api_name="/add_message", supports_vision=True, supports_system_prompt=False,
            supports_temperature=False, supports_streaming=False, supports_history=False,
            supports_thinking=False, max_tokens_default=4096,
        ),
    ]
    for model in builtin_models:
        register_model(model)


# Populate the registry at import time.
_init_registry()
133
+
134
+ # ═══════════════════════════════════════════════════════════════
135
+ # CONFIG
136
+ # ═══════════════════════════════════════════════════════════════
137
+
138
@dataclass
class Config:
    """Runtime settings for the gateway; see ``from_env`` for environment overrides."""

    default_model: str = DEFAULT_MODEL
    default_system_prompt: str = DEFAULT_SYSTEM_PROMPT
    timeout_stream: int = 300          # passed as the requests timeout for SSE reads
    max_retries: int = 3
    retry_backoff_base: float = 1.5
    retry_jitter: float = 0.5
    rate_limit_rpm: int = 10
    rate_limit_burst: int = 3
    pool_size: int = 2
    max_history_messages: int = 50
    max_message_length: int = 10000
    default_temperature: float = 0.7
    include_thinking: bool = True
    log_sse_raw: bool = False

    @classmethod
    def from_env(cls) -> "Config":
        """Build a Config from defaults, overridden by ``MMAI_*`` environment variables.

        A variable whose value cannot be converted is ignored (the default is
        kept) and a warning is logged, so a typo in the environment is visible
        instead of silently discarded.
        """
        cfg = cls()
        # env var -> (attribute name, converter applied to the raw string)
        env_map = {
            "MMAI_TIMEOUT": ("timeout_stream", int),
            "MMAI_MAX_RETRIES": ("max_retries", int),
            "MMAI_RATE_LIMIT": ("rate_limit_rpm", int),
            "MMAI_POOL_SIZE": ("pool_size", int),
            "MMAI_SYSTEM_PROMPT": ("default_system_prompt", str),
            "MMAI_TEMPERATURE": ("default_temperature", float),
            "MMAI_DEFAULT_MODEL": ("default_model", str),
            "MMAI_INCLUDE_THINKING": ("include_thinking", lambda x: x.lower() in ("1", "true")),
        }
        for env_key, (attr, conv) in env_map.items():
            val = os.environ.get(env_key)
            if val is None:
                continue
            try:
                setattr(cfg, attr, conv(val))
            except (ValueError, TypeError):
                log.warning(
                    "Ignoring invalid value %r for %s; keeping default %r",
                    val, env_key, getattr(cfg, attr),
                )
        return cfg
176
+
177
+ # ═══════════════════════════════════════════════════════════════
178
+ # EXCEPTIONS
179
+ # ═══════════════════════════════════════════════════════════════
180
+
181
class APIError(Exception):
    """Base gateway error carrying a machine-readable ``code`` and a numeric ``status``."""

    def __init__(self, message: str, code: str = "UNKNOWN", status: int = 500):
        Exception.__init__(self, message)
        self.status = status
        self.code = code

    def to_dict(self):
        """Return the error as a ``{"error": ..., "code": ...}`` dictionary."""
        payload = {"error": str(self)}
        payload["code"] = self.code
        return payload
188
+
189
class ModelNotFoundError(APIError):
    """Raised for a model id missing from MODEL_REGISTRY (code MODEL_NOT_FOUND, status 404)."""

    def __init__(self, model_id: str):
        available = list(MODEL_REGISTRY.keys())
        message = f"Model '{model_id}' not found. Available: {available}"
        super().__init__(message, "MODEL_NOT_FOUND", 404)
192
+
193
+ # ═══════════════════════════════════════════════════════════════
194
+ # RESPONSE CLEANER
195
+ # ═══════════════════════════════════════════════════════════════
196
+
197
class ResponseCleaner:
    """Stateless helpers that reduce raw Space output to plain response text."""

    # "**Response:**" heading, optionally preceded by a single decoration token
    # (e.g. an emoji) inside the bold markers; captures everything after it.
    _RESPONSE_HEADING = re.compile(r'\*\*(?:[^\s*]+\s*)?Response:\*\*\s*(.*)$', re.DOTALL)
    # "assistantfinal" / "assistant final" marker; captures everything after it.
    _FINAL_MARKER = re.compile(r'assistant\s*final\s*(.*)$', re.DOTALL | re.IGNORECASE)
    # A semicolon-terminated character reference: decimal, hex, or named.
    _ENTITY = re.compile(r'&(#[0-9]+|#[xX][0-9a-fA-F]+|[A-Za-z][A-Za-z0-9]*);')

    @classmethod
    def clean_analysis(cls, text: str) -> str:
        """Strip the analysis preamble from *text* and return the final answer.

        Returns the text after a ``**Response:**`` heading or after an
        ``assistantfinal`` marker when one is present and followed by content.
        Text that starts with "analysis" but has no final part yields ``""``;
        anything else is returned stripped. Falsy input is returned unchanged.
        """
        if not text:
            return text
        original = text.strip()
        for pattern in (cls._RESPONSE_HEADING, cls._FINAL_MARKER):
            match = pattern.search(original)
            if match:
                cleaned = match.group(1).strip()
                if cleaned:
                    return cleaned
        if re.match(r'^analysis', original, re.IGNORECASE):
            return ""
        return original

    @classmethod
    def _decode_html_entities(cls, text: str) -> str:
        """Decode semicolon-terminated HTML character references in one pass.

        Only references ending in ``;`` are touched, so ``&name=value`` inside
        URLs is left alone. Decoding is single-pass, so ``&amp;lt;`` becomes
        ``&lt;`` rather than ``<``. Unknown names are left as-is, and
        ``&nbsp;`` becomes a plain space.
        """
        import html
        from html.entities import html5

        def _decode(match):
            body = match.group(1)
            if body[0] == '#':
                # html.unescape handles out-of-range code points without raising.
                return html.unescape(match.group(0))
            if body == 'nbsp':
                return ' '
            return html5.get(body + ';', match.group(0))

        return cls._ENTITY.sub(_decode, text)

    @classmethod
    def _strip_html(cls, text: str) -> str:
        """Turn ``<br>`` into newlines, drop all other tags, and decode entities."""
        text = re.sub(r'<br\s*/?>', '\n', text, flags=re.IGNORECASE)
        text = re.sub(r'<[^>]+>', '', text)
        return cls._decode_html_entities(text).strip()

    @classmethod
    def clean_glm(cls, text: str, include_thinking: bool = True) -> str:
        """Extract the answer (and optionally the thinking part) from GLM HTML output.

        The content of the ``<div>`` inside a ``<details>`` element is treated
        as thinking text. The first ``<div>`` outside ``<details>`` (or, failing
        that, the remaining text) is the response. When thinking text exists and
        *include_thinking* is true, the result is wrapped as
        ``<thinking>...</thinking>`` followed by the response.
        """
        if not text:
            return text
        if '<details' not in text and '<div' not in text:
            return text.strip()
        thinking_text = ""
        thinking_match = re.search(
            r'<details[^>]*>.*?<div[^>]*>(.*?)</div>\s*</details>',
            text, re.DOTALL | re.IGNORECASE,
        )
        if thinking_match:
            thinking_text = cls._strip_html(thinking_match.group(1)).strip()
        text_without_details = re.sub(
            r'<details[^>]*>.*?</details>', '', text, flags=re.DOTALL | re.IGNORECASE,
        ).strip()
        div_match = re.search(
            r"<div[^>]*>\s*(.*?)\s*</div>", text_without_details, re.DOTALL | re.IGNORECASE,
        )
        if div_match:
            response_text = cls._strip_html(div_match.group(1)).strip()
        else:
            response_text = cls._strip_html(text_without_details).strip()
        if thinking_text and include_thinking:
            return f"<thinking>\n{thinking_text}\n</thinking>\n{response_text}"
        return response_text

    @classmethod
    def extract_qwen_text(cls, result: Any) -> str:
        """Pull the latest assistant text out of a Qwen Space result.

        Strings are returned stripped. For a tuple, each dict element's
        ``"value"`` list is searched from the end for a message with role
        ``"assistant"``; its content may be a string or a list of blocks (blocks
        of type ``"file"`` are skipped). Anything else falls back to
        ``str(result)``, or ``""`` when the result is falsy.
        """
        if result is None:
            return ""
        if isinstance(result, str):
            return result.strip()
        if isinstance(result, tuple):
            for el in result:
                if not isinstance(el, dict):
                    continue
                value = el.get("value")
                if not isinstance(value, list):
                    continue
                for msg in reversed(value):
                    if not (isinstance(msg, dict) and msg.get("role") == "assistant"):
                        continue
                    content = msg.get("content", "")
                    if isinstance(content, str):
                        return content.strip()
                    if isinstance(content, list):
                        texts = []
                        for block in content:
                            if isinstance(block, str):
                                texts.append(block)
                            elif isinstance(block, dict) and block.get("type") != "file":
                                bc = block.get("content", "")
                                if isinstance(bc, str) and bc.strip():
                                    texts.append(bc)
                        return "\n".join(t for t in texts if t.strip()).strip()
                    return str(content)
        return str(result) if result else ""

    @classmethod
    def extract_chatgpt_text(cls, result: Any) -> str:
        """Pull the latest reply out of a ChatGPT Space result.

        For a tuple, the first element is treated as the chatbot history and the
        second item of its last pair is the reply. Other shapes fall back to a
        stripped ``str()`` of what is available.
        """
        if isinstance(result, str):
            return result.strip()
        if isinstance(result, tuple) and len(result) >= 1:
            chatbot = result[0]
            if isinstance(chatbot, (list, tuple)) and chatbot:
                last = chatbot[-1]
                if isinstance(last, (list, tuple)) and len(last) >= 2:
                    msg = last[1]
                    if isinstance(msg, str):
                        return msg.strip()
                    if isinstance(msg, dict):
                        return str(msg.get("value", msg.get("content", ""))).strip()
                    return str(msg).strip() if msg else ""
            return str(chatbot).strip() if chatbot else ""
        return str(result)

    @classmethod
    def clean(cls, text: str, model_id: str = "", include_thinking: bool = True) -> str:
        """Apply the model-specific cleaner for *model_id*, then decode HTML entities."""
        if not text:
            return text
        text = text.strip()
        if model_id == "gpt-oss-120b":
            text = cls.clean_analysis(text)
        elif model_id == "glm-4.5":
            text = cls.clean_glm(text, include_thinking=include_thinking)
        # Cheap pre-check: an entity needs both characters to be present.
        if '&' in text and ';' in text:
            text = cls._decode_html_entities(text)
        return text.strip()
321
+
322
+ # ═══════════════════════════════════════════════════════════════
323
+ # THINKING PARSER
324
+ # ═══════════════════════════════════════════════════════════════
325
+
326
class ThinkingParser:
    """Convert between ``<thinking>``-tagged text and a (thinking, response) pair."""

    @staticmethod
    def split(text: str) -> Tuple[Optional[str], str]:
        """Split a leading ``<thinking>...</thinking>`` section off *text*.

        Returns ``(thinking, response)``; ``thinking`` is ``None`` when the tag
        is absent or its content is empty after stripping.
        """
        found = re.match(
            r'\s*<thinking>\s*\n?(.*?)\n?\s*</thinking>\s*\n?(.*)',
            text,
            re.DOTALL | re.IGNORECASE,
        )
        if found is None:
            return (None, text.strip())
        thought, answer = (part.strip() for part in found.groups())
        return (thought or None, answer)

    @staticmethod
    def format(thinking: Optional[str], response: str) -> str:
        """Inverse of ``split``: prefix *response* with a thinking block when one is given."""
        if not thinking:
            return response
        return f"<thinking>\n{thinking}\n</thinking>\n{response}"
341
+
342
+ # ═══════════════════════════════════════════════════════════════
343
+ # DATA MODELS
344
+ # ═══════════════════════════════════════════════════════════════
345
+
346
def _new_message_id() -> str:
    """Return a fresh random UUID4 rendered as a string."""
    return str(uuid.uuid4())


@dataclass
class Message:
    """One conversation entry; timestamp and id are generated at creation time."""

    role: str                       # e.g. "system", "user" or "assistant"
    content: str
    thinking: Optional[str] = None  # optional thinking text stored with the message
    timestamp: float = field(default_factory=time.time)
    message_id: str = field(default_factory=_new_message_id)
353
+
354
@dataclass
class Conversation:
    """A message thread bound to one model, with a bounded history."""

    conversation_id: str = field(default_factory=lambda: str(uuid.uuid4()))
    messages: List[Message] = field(default_factory=list)
    created_at: float = field(default_factory=time.time)
    updated_at: float = field(default_factory=time.time)
    title: Optional[str] = None
    system_prompt: str = DEFAULT_SYSTEM_PROMPT
    model_id: str = DEFAULT_MODEL

    def add_message(self, role: str, content: str, max_messages: int = 50,
                    thinking: Optional[str] = None) -> Message:
        """Append a message and trim the history to at most *max_messages* entries.

        The first user message also sets the title (first 80 characters). When
        trimming, all system messages are kept (and moved to the front) and only
        the most recent non-system messages fill the remaining budget.

        Returns:
            The newly created Message.
        """
        msg = Message(role=role, content=content, thinking=thinking)
        self.messages.append(msg)
        self.updated_at = time.time()
        if self.title is None and role == "user":
            self.title = content[:80]
        if len(self.messages) > max_messages:
            system_msgs = [m for m in self.messages if m.role == "system"]
            other_msgs = [m for m in self.messages if m.role != "system"]
            # Clamp the budget: a zero or negative slice bound would otherwise
            # keep all (or an arbitrary prefix-trimmed set of) other messages.
            budget = max(0, max_messages - len(system_msgs))
            recent = other_msgs[-budget:] if budget else []
            self.messages = system_msgs + recent
        return msg

    def build_gradio_history(self) -> List[List[str]]:
        """Return ``[user, assistant]`` content pairs for adjacent user/assistant turns.

        System messages are ignored; a user message not immediately followed by
        an assistant message is skipped.
        """
        turns = [m for m in self.messages if m.role != "system"]
        history: List[List[str]] = []
        i = 0
        while i < len(turns) - 1:
            current, following = turns[i], turns[i + 1]
            if current.role == "user" and following.role == "assistant":
                history.append([current.content, following.content])
                i += 2
            else:
                i += 1
        return history

    def build_chatbot_tuples(self) -> List[List[str]]:
        """Alias of ``build_gradio_history``."""
        return self.build_gradio_history()

    def to_dict(self) -> Dict:
        """Return summary metadata for the conversation (no message bodies)."""
        return {
            "conversation_id": self.conversation_id, "title": self.title,
            "model": self.model_id, "message_count": len(self.messages),
            "created_at": self.created_at, "updated_at": self.updated_at,
        }
397
+
398
+ # ═══════════════════════════════════════════════════════════════
399
+ # METRICS & RATE LIMITER
400
+ # ═══════════════════════════════════════════════════════════════
401
+
402
@dataclass
class Metrics:
    """In-process request counters; the recording methods and ``to_dict`` hold ``_lock``."""

    _lock: threading.Lock = field(default_factory=threading.Lock, repr=False)
    total_requests: int = 0
    successful_requests: int = 0
    failed_requests: int = 0
    total_retries: int = 0
    total_chars_received: int = 0
    active_streams: int = 0
    requests_per_model: Dict[str, int] = field(default_factory=dict)
    # Only the most recent 1000 latencies are retained.
    _latencies: deque = field(default_factory=lambda: deque(maxlen=1000), repr=False)
    started_at: float = field(default_factory=time.time)

    def record_request(self, success: bool, duration_ms: float, chars: int = 0, model: str = ""):
        """Count one finished request; *chars* is only added for successful ones."""
        with self._lock:
            self.total_requests += 1
            if not success:
                self.failed_requests += 1
            else:
                self.successful_requests += 1
                self.total_chars_received += chars
            self._latencies.append(duration_ms)
            if model:
                previous = self.requests_per_model.get(model, 0)
                self.requests_per_model[model] = previous + 1

    def record_retry(self):
        """Count one retry attempt."""
        with self._lock:
            self.total_retries += 1

    def to_dict(self) -> Dict:
        """Return a snapshot of all counters plus derived averages and uptime."""
        with self._lock:
            if self._latencies:
                avg = sum(self._latencies) / len(self._latencies)
            else:
                avg = 0
            if self.total_requests:
                rate = self.successful_requests / self.total_requests
            else:
                rate = 1  # no traffic yet is reported as a perfect success rate
            snapshot = {
                "total_requests": self.total_requests,
                "successful": self.successful_requests,
                "failed": self.failed_requests,
                "success_rate": round(rate, 4),
                "retries": self.total_retries,
                "chars_received": self.total_chars_received,
                "avg_latency_ms": round(avg, 1),
                "active_streams": self.active_streams,
                "uptime_s": round(time.time() - self.started_at, 1),
                "per_model": dict(self.requests_per_model),
            }
            return snapshot


# Module-wide metrics instance shared by the providers.
metrics = Metrics()
445
+
446
class RateLimiter:
    """Token bucket refilled at ``rpm / 60`` tokens per second, capped at ``burst``."""

    def __init__(self, rpm: int = 10, burst: int = 3):
        capacity = float(burst)
        self._lock = threading.Lock()
        self.rate = rpm / 60.0
        self.max_tokens = capacity
        self.tokens = capacity          # the bucket starts full
        self.last_refill = time.monotonic()

    def _try_take(self) -> bool:
        """Refill for the elapsed time, then consume one token if available."""
        with self._lock:
            now = time.monotonic()
            refilled = self.tokens + (now - self.last_refill) * self.rate
            self.tokens = min(self.max_tokens, refilled)
            self.last_refill = now
            if self.tokens < 1.0:
                return False
            self.tokens -= 1.0
            return True

    def acquire(self, timeout: float = 30.0) -> bool:
        """Block until a token is taken (True) or *timeout* seconds pass (False).

        Polls every 0.1 s; at least one attempt is made even with a zero timeout.
        """
        deadline = time.monotonic() + timeout
        while not self._try_take():
            if time.monotonic() >= deadline:
                return False
            time.sleep(0.1)
        return True
467
+
468
+ # ═══════════════════════════════════════════════════════════════
469
+ # CIRCUIT BREAKER
470
+ # ═══════════════════════════════════════════════════════════════
471
+
472
class CircuitBreaker:
    """Three-state breaker: ``closed`` -> ``open`` -> ``half_open`` -> ``closed``.

    The breaker opens after *threshold* accumulated failures (each success in
    the closed state forgives one). After *recovery* seconds it allows probe
    calls in ``half_open``; two probe successes close it, one failure re-opens it.
    """

    def __init__(self, threshold: int = 5, recovery: int = 60):
        self.threshold = threshold
        self.recovery = recovery
        self.state = "closed"
        self.failures = 0
        self.successes = 0        # successes counted during the current half-open phase
        self.last_failure = 0.0   # time.time() of the most recent failure
        self._lock = threading.Lock()

    def can_execute(self) -> bool:
        """Return True when a call may proceed; may move ``open`` to ``half_open``."""
        with self._lock:
            if self.state == "closed":
                return True
            if self.state == "open":
                if time.time() - self.last_failure >= self.recovery:
                    self.state = "half_open"
                    # Each half-open phase must earn its two successes from
                    # scratch; a success left over from an earlier, failed
                    # probe phase must not count towards closing.
                    self.successes = 0
                    return True
                return False
            return self.successes < 2

    def record_success(self):
        """Record a successful call."""
        with self._lock:
            if self.state == "half_open":
                self.successes += 1
                if self.successes >= 2:
                    self.state = "closed"
                    self.failures = 0
                    self.successes = 0
            else:
                self.failures = max(0, self.failures - 1)

    def record_failure(self):
        """Record a failed call; opens the breaker at the threshold or on a failed probe."""
        with self._lock:
            self.failures += 1
            self.last_failure = time.time()
            if self.state == "half_open" or self.failures >= self.threshold:
                self.state = "open"
510
+
511
+ # ═══════════════════════════════════════════════════════════════
512
+ # SSE PARSER (for GPT-OSS)
513
+ # ═══════════════════════════════════════════════════════════════
514
+
515
class GradioSSEParser:
    """Helpers for reading a Gradio queue event stream (SSE ``data:`` lines with JSON)."""

    @staticmethod
    def _parse_line(raw: bytes) -> Generator[Dict, None, None]:
        """Yield the decoded JSON payload of one ``data:`` line, or nothing."""
        line = raw.decode("utf-8", errors="replace").strip()
        if not line.startswith("data:"):
            return
        data_str = line[5:].strip()
        if not data_str:
            return
        try:
            yield json.loads(data_str)
        except json.JSONDecodeError:
            # Malformed payloads are skipped rather than aborting the stream.
            return

    @staticmethod
    def parse_sse(response: "requests.Response", log_raw: bool = False) -> Generator[Dict, None, None]:
        """Yield the JSON payload of every ``data:`` line in a streamed response.

        The body is read as bytes and each line is decoded as UTF-8. This avoids
        depending on the encoding requests guesses from the headers, which can
        garble non-ASCII text or hand back undecoded bytes. A final line without
        a trailing newline is parsed too.

        Args:
            response: a streamed response exposing ``iter_content``.
            log_raw: accepted for interface compatibility; currently unused.
        """
        buffer = b""
        for chunk in response.iter_content(chunk_size=None, decode_unicode=False):
            if not chunk:
                continue
            if isinstance(chunk, str):
                # Tolerate response objects that already hand out text.
                chunk = chunk.encode("utf-8")
            buffer += chunk
            # Everything before the last newline is complete; keep the tail.
            *lines, buffer = buffer.split(b"\n")
            for raw_line in lines:
                yield from GradioSSEParser._parse_line(raw_line)
        if buffer.strip():
            yield from GradioSSEParser._parse_line(buffer)

    @staticmethod
    def extract_text(output: Dict) -> str:
        """Return the text carried by an event's ``output`` dict, or ``""``.

        Looks at ``output["data"][0]``. A string is returned as-is. For a list
        whose first element is itself a list, that inner list's last item is
        returned as a string.
        """
        data = output.get("data", [])
        if not data:
            return ""
        first = data[0]
        if isinstance(first, str):
            return first
        if isinstance(first, list):
            try:
                if first and isinstance(first[0], list):
                    return str(first[0][-1])
            except (IndexError, TypeError):
                pass
        return ""
551
+
552
+ # ═══════════════════════════════════════════════════════════════
553
+ # MODEL PROVIDERS
554
+ # ═══════════════════════════════════════════════════════════════
555
+
556
class ModelProvider(ABC):
    """Abstract base for a backend that can answer a chat message for one model."""

    def __init__(self, model_def: "ModelDef", config: "Config"):
        self._lock = threading.Lock()
        self.ready = False  # subclasses set this once initialize() has succeeded
        self.config = config
        self.model_def = model_def

    @abstractmethod
    def initialize(self) -> bool:
        """Prepare the backend; return True when it is usable."""
        ...

    @abstractmethod
    def generate(self, message: str, history=None, system_prompt=None,
                 temperature=None, max_tokens=None, **kwargs) -> str:
        """Return the complete response text for *message*."""
        ...

    def generate_stream(self, message: str, **kwargs) -> Generator[str, None, None]:
        """Default streaming: yield the full ``generate`` result as a single chunk."""
        whole = self.generate(message, **kwargs)
        yield whole
572
+
573
+
574
class GptOssProvider(ModelProvider):
    """Provider that drives a Gradio Space over HTTP (``queue/join`` + SSE ``queue/data``)."""

    def __init__(self, model_def, config):
        super().__init__(model_def, config)
        self._session = requests.Session()
        self._rotate()

    def _rotate(self):
        """Refresh the session headers with a randomly chosen User-Agent."""
        self._session.headers.update({
            "User-Agent": random.choice(USER_AGENTS),
            "Accept-Language": "fr-FR,fr;q=0.9",
            "Origin": "https://gptunlimited.org",
            "Referer": "https://gptunlimited.org/",
        })

    def _hash(self):
        """Return a random 12-character lowercase/digit session hash."""
        return ''.join(random.choices(string.ascii_lowercase + string.digits, k=12))

    def initialize(self) -> bool:
        """Probe the Space's ``gradio_api/info`` endpoint; ready when it answers 200."""
        with self._lock:
            if self.ready:
                return True
            self._rotate()
            try:
                r = self._session.get(f"{self.model_def.space_id}/gradio_api/info", timeout=15)
            except requests.RequestException as exc:
                log.warning("Init failed for %s: %s", self.model_def.model_id, exc)
                return False
            self.ready = r.status_code == 200
            return self.ready

    def _open_stream(self, message, history, system_prompt, temperature):
        """Submit the prompt to the queue and return the streamed SSE response.

        Raises:
            APIError: when the join request is not answered with 200 or the
                reply carries no ``event_id``.
        """
        if not self.ready:
            self.initialize()  # best effort; the join below reports real failures
        sys_p = system_prompt or self.config.default_system_prompt
        temp = temperature if temperature is not None else self.model_def.default_temperature
        h = self._hash()
        base = self.model_def.space_id
        payload = {"data": [message, history or [], sys_p, temp], "event_data": None,
                   "fn_index": self.model_def.fn_index, "trigger_id": None, "session_hash": h}
        r = self._session.post(f"{base}/gradio_api/queue/join?", json=payload,
                               headers={"Content-Type": "application/json"}, timeout=30)
        if r.status_code != 200:
            raise APIError(f"Queue join failed: {r.status_code}")
        if not r.json().get("event_id"):
            raise APIError("No event_id")
        return self._session.get(f"{base}/gradio_api/queue/data",
                                 params={"session_hash": h},
                                 headers={"Accept": "text/event-stream"},
                                 timeout=self.config.timeout_stream, stream=True)

    @staticmethod
    def _iter_texts(resp) -> Generator[str, None, None]:
        """Yield each non-empty text snapshot until the stream completes or closes.

        Raises:
            APIError: when an event's output reports ``success`` as false.
        """
        for d in GradioSSEParser.parse_sse(resp):
            if not isinstance(d, dict):
                continue
            msg = d.get("msg", "")
            if msg == "close_stream":
                return
            if msg not in ("process_generating", "process_completed"):
                continue
            output = d.get("output", {})
            if not output.get("success", True):
                raise APIError(f"Gradio error: {output.get('error')}")
            text = GradioSSEParser.extract_text(output)
            if text:
                yield text
            if msg == "process_completed":
                return

    def generate(self, message, history=None, system_prompt=None, temperature=None, max_tokens=None, **kw):
        """Return the final response text (``max_tokens`` is accepted but not sent).

        Raises:
            APIError: on queue/stream failures or when no text was received
                (code ``EMPTY``).
        """
        resp = self._open_stream(message, history, system_prompt, temperature)
        full = ""
        try:
            for text in self._iter_texts(resp):
                full = text  # each event carries the whole text so far
        finally:
            resp.close()
        if not full.strip():
            raise APIError("Empty response", "EMPTY")
        return ResponseCleaner.clean_analysis(full) if self.model_def.clean_analysis else full

    def generate_stream(self, message, history=None, system_prompt=None, temperature=None, max_tokens=None, **kw):
        """Yield response text increments as the Space produces them.

        Each event's text is treated as a growing snapshot; only the part beyond
        what was already yielded is emitted. The queue submission is validated
        exactly as in ``generate``.
        """
        resp = self._open_stream(message, history, system_prompt, temperature)
        with metrics._lock:
            metrics.active_streams += 1
        last = ""
        try:
            for raw in self._iter_texts(resp):
                current = ResponseCleaner.clean_analysis(raw) if self.model_def.clean_analysis else raw
                if current and len(current) > len(last):
                    yield current[len(last):]
                    last = current
        finally:
            resp.close()
            with metrics._lock:
                metrics.active_streams = max(0, metrics.active_streams - 1)
679
+
680
+
681
+ class GradioClientProvider(ModelProvider):
682
+ """Generic provider for all gradio_client based models."""
683
+ def __init__(self, model_def, config):
684
+ super().__init__(model_def, config)
685
+ self._client = None
686
+ self._chat_counter = 0
687
+
688
+ def initialize(self) -> bool:
689
+ if not HAS_GRADIO_CLIENT:
690
+ raise APIError(f"gradio_client not installed", "MISSING_DEP")
691
+ with self._lock:
692
+ if self.ready:
693
+ return True
694
+ try:
695
+ log.info(f"Connecting to {self.model_def.space_id}...")
696
+ self._client = GradioClient(self.model_def.space_id)
697
+ self.ready = True
698
+ return True
699
+ except Exception as e:
700
+ log.error(f"Init failed for {self.model_def.model_id}: {e}")
701
+ return False
702
+
703
+ def generate(self, message, history=None, system_prompt=None, temperature=None, max_tokens=None, **kw):
704
+ if not self.ready:
705
+ self.initialize()
706
+ if not self._client:
707
+ raise APIError(f"{self.model_def.model_id} not initialized")
708
+
709
+ mid = self.model_def.model_id
710
+ try:
711
+ if mid == "command-a-vision":
712
+ max_new = max_tokens or self.model_def.extra_params.get("max_new_tokens", 700)
713
+ result = self._client.predict(message={"text": message, "files": []},
714
+ max_new_tokens=max_new, api_name=self.model_def.api_name)
715
+ elif mid == "command-a-translate":
716
+ max_new = max_tokens or self.model_def.extra_params.get("max_new_tokens", 700)
717
+ result = self._client.predict(message=message, max_new_tokens=max_new,
718
+ api_name=self.model_def.api_name)
719
+ elif mid == "minimax-vl-01":
720
+ temp = temperature if temperature is not None else self.model_def.default_temperature
721
+ max_tok = max_tokens or self.model_def.extra_params.get("max_tokens", 12800)
722
+ top_p = kw.get("top_p", self.model_def.extra_params.get("top_p", 0.9))
723
+ result = self._client.predict(message={"text": message, "files": []},
724
+ max_tokens=max_tok, temperature=temp, top_p=top_p,
725
+ api_name=self.model_def.api_name)
726
+ elif mid == "glm-4.5":
727
+ sys_p = system_prompt or self.config.default_system_prompt
728
+ temp = temperature if temperature is not None else self.model_def.default_temperature
729
+ thinking = kw.get("thinking_enabled", self.model_def.thinking_default)
730
+ include = kw.get("include_thinking", self.config.include_thinking)
731
+ result = self._client.predict(msg=message, sys_prompt=sys_p,
732
+ thinking_enabled=thinking, temperature=temp,
733
+ api_name=self.model_def.api_name)
734
+ return self._extract_glm(result, include)
735
+ elif mid == "chatgpt":
736
+ temp = temperature if temperature is not None else self.model_def.default_temperature
737
+ top_p = kw.get("top_p", self.model_def.extra_params.get("top_p", 1.0))
738
+ chat_hist = []
739
+ if history:
740
+ for pair in history:
741
+ if isinstance(pair, (list, tuple)) and len(pair) == 2:
742
+ chat_hist.append([str(pair[0]), str(pair[1])])
743
+ result = self._client.predict(inputs=message, top_p=top_p, temperature=temp,
744
+ chat_counter=self._chat_counter, chatbot=chat_hist,
745
+ api_name=self.model_def.api_name)
746
+ self._chat_counter += 1
747
+ return ResponseCleaner.extract_chatgpt_text(result)
748
+ elif mid == "qwen3-vl":
749
+ result = self._client.predict(input_value={"files": None, "text": message},
750
+ api_name="/add_message")
751
+ return ResponseCleaner.extract_qwen_text(result)
752
+ else:
753
+ raise APIError(f"Unknown model handler: {mid}")
754
+
755
+ # Default extraction for simple results
756
+ if isinstance(result, str):
757
+ return result.strip()
758
+ if isinstance(result, dict):
759
+ return json.dumps(result, ensure_ascii=False)
760
+ if isinstance(result, (list, tuple)):
761
+ return str(result[0]).strip() if result else ""
762
+ return str(result)
763
+
764
+ except APIError:
765
+ raise
766
+ except Exception as e:
767
+ raise APIError(f"{mid} error: {e}", "PROVIDER_ERROR")
768
+
769
def _extract_glm(self, result, include_thinking: bool = True) -> str:
    """Pull the reply text out of a GLM prediction result and clean it.

    When ``result`` is a non-empty tuple, its first element is treated as
    the chatbot transcript: the most recent ``assistant`` dict wins, then
    the last entry if it is a dict, then the transcript's string form.
    Any other ``result`` is cleaned directly (stringified if needed).
    """
    def cleaned(value) -> str:
        # Non-string payloads are stringified before cleaning.
        text = value if isinstance(value, str) else str(value)
        return ResponseCleaner.clean_glm(text, include_thinking)

    if not (isinstance(result, tuple) and len(result) >= 1):
        return cleaned(result)

    transcript = result[0]
    if isinstance(transcript, list) and transcript:
        # Search newest-first for an assistant turn.
        chosen = next(
            (entry for entry in reversed(transcript)
             if isinstance(entry, dict) and entry.get("role") == "assistant"),
            None,
        )
        if chosen is None and isinstance(transcript[-1], dict):
            chosen = transcript[-1]
        if chosen is not None:
            return cleaned(chosen.get("content", ""))
    return cleaned(str(transcript))
787
+
788
+
789
+ # Factory
790
def create_provider(model_id: str, config: Config) -> ModelProvider:
    """Build the provider implementation that serves ``model_id``.

    Raises:
        ModelNotFoundError: if ``model_id`` is not a key of MODEL_REGISTRY.
    """
    if model_id not in MODEL_REGISTRY:
        raise ModelNotFoundError(model_id)
    # Only gpt-oss-120b has a dedicated provider; the rest share the Gradio one.
    provider_cls = GptOssProvider if model_id == "gpt-oss-120b" else GradioClientProvider
    return provider_cls(MODEL_REGISTRY[model_id], config)
797
+
798
+ # ═══════════════════════════════════════════════════════════════
799
+ # MULTI-MODEL CLIENT
800
+ # ═══════════════════════════════════════════════════════════════
801
+
802
class MultiModelClient:
    """Client that routes chat requests to any model in MODEL_REGISTRY.

    Providers are created lazily (one per model id) and every request goes
    through input validation, the circuit breaker, the rate limiter, a
    retry loop and metrics recording. Conversations are kept in memory.
    """

    def __init__(self, config: Config):
        self.config = config
        self._providers: Dict[str, ModelProvider] = {}
        self._lock = threading.Lock()  # serialises lazy provider creation
        self._conversations: Dict[str, Conversation] = {}
        self._active_conv_id: Optional[str] = None
        self._current_model = config.default_model
        self.rate_limiter = RateLimiter(config.rate_limit_rpm, config.rate_limit_burst)
        self.circuit_breaker = CircuitBreaker()

    @property
    def current_model(self):
        """Model id used when a request does not name one."""
        return self._current_model

    @current_model.setter
    def current_model(self, m):
        if m not in MODEL_REGISTRY:
            raise ModelNotFoundError(m)
        self._current_model = m

    def _get_provider(self, model_id: str) -> ModelProvider:
        """Return the cached provider for ``model_id``, creating it on first use."""
        if model_id not in self._providers:
            with self._lock:
                # Re-check under the lock so two threads do not both create it.
                if model_id not in self._providers:
                    self._providers[model_id] = create_provider(model_id, self.config)
        return self._providers[model_id]

    def _ensure_ready(self, model_id: str) -> ModelProvider:
        """Return an initialised provider or raise ``APIError`` (INIT_FAILED)."""
        p = self._get_provider(model_id)
        if not p.ready:
            if not p.initialize():
                raise APIError(f"Cannot init {model_id}", "INIT_FAILED")
        return p

    @property
    def active_conversation(self) -> Conversation:
        """The active conversation; one is created if none is active yet."""
        if self._active_conv_id not in self._conversations:
            conv = Conversation(system_prompt=self.config.default_system_prompt, model_id=self._current_model)
            self._conversations[conv.conversation_id] = conv
            self._active_conv_id = conv.conversation_id
        return self._conversations[self._active_conv_id]

    def new_conversation(self, system_prompt=None, model_id=None) -> Conversation:
        """Create a conversation, register it, and make it the active one."""
        conv = Conversation(system_prompt=system_prompt or self.config.default_system_prompt,
                            model_id=model_id or self._current_model)
        self._conversations[conv.conversation_id] = conv
        self._active_conv_id = conv.conversation_id
        return conv

    def init_model(self, model_id: str) -> bool:
        """Best-effort provider initialisation; returns False instead of raising."""
        try:
            return self._get_provider(model_id).initialize()
        except Exception as e:
            # Only ordinary errors are swallowed; KeyboardInterrupt/SystemExit
            # must still propagate, which a bare ``except:`` would prevent.
            log.warning(f"Failed to initialise {model_id}: {e}")
            return False

    def send_message(self, message: str, *, stream: bool = False, model: Optional[str] = None,
                     conversation_id: Optional[str] = None, system_prompt: Optional[str] = None,
                     temperature: Optional[float] = None, max_tokens: Optional[int] = None,
                     include_thinking: Optional[bool] = None, **kwargs) -> Union[str, Generator]:
        """Send ``message`` to a model and return its reply.

        Args:
            message: User text; stripped before validation.
            stream: Return a chunk generator when the model supports streaming.
            model: Model id; defaults to ``current_model``.
            conversation_id: Conversation to use; unknown or missing ids fall
                back to the active conversation.
            system_prompt: When truthy, replaces the conversation's prompt.
            temperature: Sampling temperature; defaults to the model's default.
            max_tokens: Forwarded to the provider unchanged.
            include_thinking: Overrides ``config.include_thinking`` for models
                that support thinking.
            **kwargs: Extra provider-specific options.

        Returns:
            The provider's result string, or a generator of chunks when
            streaming.

        Raises:
            ModelNotFoundError: ``model`` is not registered.
            APIError: invalid input, open circuit, rate limiting, or a
                provider failure on the final attempt.
        """
        model_id = model or self._current_model
        if model_id not in MODEL_REGISTRY:
            raise ModelNotFoundError(model_id)
        mdef = MODEL_REGISTRY[model_id]
        message = message.strip()
        if not message:
            raise APIError("Empty message", "INVALID_INPUT", 400)
        if len(message) > self.config.max_message_length:
            raise APIError("Message too long", "INVALID_INPUT", 400)
        if not self.circuit_breaker.can_execute():
            raise APIError("Circuit breaker open", "CIRCUIT_OPEN", 503)
        if not self.rate_limiter.acquire(timeout=10.0):
            raise APIError("Rate limited", "RATE_LIMITED", 429)

        # Touch active_conversation only when it is needed: the property
        # creates a conversation as a side effect when none is active.
        conv = self._conversations.get(conversation_id) if conversation_id else None
        if conv is None:
            conv = self.active_conversation
        conv.model_id = model_id
        if system_prompt:
            conv.system_prompt = system_prompt

        # History is captured before the new user turn is appended.
        history = conv.build_gradio_history() if mdef.supports_history else None
        conv.add_message("user", message, self.config.max_history_messages)

        eff_temp = temperature if temperature is not None else mdef.default_temperature
        eff_sys = conv.system_prompt if mdef.supports_system_prompt else None
        eff_thinking = include_thinking if include_thinking is not None else self.config.include_thinking

        extra = dict(kwargs)
        if mdef.supports_thinking:
            extra["include_thinking"] = eff_thinking

        start = time.monotonic()

        for attempt in range(self.config.max_retries + 1):
            try:
                if attempt > 0:
                    # Exponential backoff plus random jitter between attempts.
                    time.sleep(self.config.retry_backoff_base ** attempt + random.uniform(0, self.config.retry_jitter))
                    metrics.record_retry()

                provider = self._ensure_ready(model_id)

                if stream and mdef.supports_streaming:
                    gen = provider.generate_stream(message, history=history, system_prompt=eff_sys,
                                                   temperature=eff_temp, max_tokens=max_tokens, **extra)
                    # Bookkeeping for streams happens inside _wrap_stream.
                    return self._wrap_stream(gen, conv, start, model_id)

                result = provider.generate(message, history=history, system_prompt=eff_sys,
                                           temperature=eff_temp, max_tokens=max_tokens, **extra)
                dur = (time.monotonic() - start) * 1000
                thinking, response = ThinkingParser.split(result)
                conv.add_message("assistant", response, self.config.max_history_messages, thinking=thinking)
                metrics.record_request(True, dur, len(result), model_id)
                self.circuit_breaker.record_success()
                return result

            except APIError:
                self.circuit_breaker.record_failure()
                if attempt == self.config.max_retries:
                    dur = (time.monotonic() - start) * 1000
                    metrics.record_request(False, dur, model=model_id)
                    raise
            except Exception as e:
                self.circuit_breaker.record_failure()
                if attempt == self.config.max_retries:
                    dur = (time.monotonic() - start) * 1000
                    metrics.record_request(False, dur, model=model_id)
                    # Chain the cause so the original traceback is preserved.
                    raise APIError(str(e)) from e

    def _wrap_stream(self, gen, conv, start, model_id):
        """Relay chunks from ``gen`` and record the outcome once it ends."""
        full = ""
        try:
            for chunk in gen:
                full += chunk
                yield chunk
            thinking, response = ThinkingParser.split(full)
            conv.add_message("assistant", response, self.config.max_history_messages, thinking=thinking)
            metrics.record_request(True, (time.monotonic() - start) * 1000, len(full), model_id)
            self.circuit_breaker.record_success()
        except Exception:
            metrics.record_request(False, (time.monotonic() - start) * 1000, model=model_id)
            self.circuit_breaker.record_failure()
            raise

    def get_status(self) -> Dict:
        """Return a summary of models, provider readiness and breaker state."""
        return {
            "version": VERSION, "current_model": self._current_model,
            "models": list(MODEL_REGISTRY.keys()),
            "providers": {m: "READY" if p.ready else "NOT_READY" for m, p in self._providers.items()},
            "conversations": len(self._conversations),
            "circuit_breaker": self.circuit_breaker.state,
        }
952
+
953
+ # ═══════════════════════════════════════════════════════════════
954
+ # SESSION POOL
955
+ # ═══════════════════════════════════════════════════════════════
956
+
957
class SessionPool:
    """Fixed-size set of MultiModelClient instances handed out round-robin."""

    def __init__(self, config: Config):
        self.config = config
        self._clients = [MultiModelClient(config) for _ in range(config.pool_size)]
        self._idx = 0
        self._lock = threading.Lock()

    def init_default(self):
        """Initialise the configured default model on every pooled client."""
        default_id = self.config.default_model
        for member in self._clients:
            member.init_model(default_id)

    def init_model(self, model_id: str) -> int:
        """Initialise ``model_id`` on every client; return how many succeeded."""
        succeeded = 0
        for member in self._clients:
            if member.init_model(model_id):
                succeeded += 1
        return succeeded

    def acquire(self) -> MultiModelClient:
        """Return the next client in rotation."""
        with self._lock:
            slot = self._idx % len(self._clients)
            self._idx += 1
            return self._clients[slot]
976
+
977
+ # ═══════════════════════════════════════════════════════════════
978
+ # MODEL ALIAS RESOLVER
979
+ # ═══════════════════════════════════════════════════════════════
980
+
981
# Lower-case shorthand name -> canonical model id.
ALIASES = {
    "gpt-oss": "gpt-oss-120b", "gptoss": "gpt-oss-120b", "amd": "gpt-oss-120b",
    "command-a": "command-a-vision", "command-vision": "command-a-vision", "cohere-vision": "command-a-vision",
    "command-translate": "command-a-translate", "cohere-translate": "command-a-translate", "translate": "command-a-translate",
    "minimax": "minimax-vl-01", "minimax-vl": "minimax-vl-01",
    "glm": "glm-4.5", "glm4": "glm-4.5", "glm-4": "glm-4.5", "zhipu": "glm-4.5",
    "gpt": "chatgpt", "gpt-3.5": "chatgpt", "gpt3": "chatgpt", "openai": "chatgpt",
    "qwen": "qwen3-vl", "qwen3": "qwen3-vl", "qwen-vl": "qwen3-vl",
}

# Canonical ids that at least one alias points to; used so that canonical
# names are matched case-insensitively, just like the aliases themselves.
_CANONICAL_IDS = frozenset(ALIASES.values())


def resolve_alias(model_id: str) -> str:
    """Translate a user-supplied model name into a canonical model id.

    Matching ignores case and surrounding whitespace. Names that are
    neither an alias nor a known canonical id are returned stripped but
    otherwise unchanged, so the caller's registry check can reject them.

    ``None`` yields ``""`` and other non-string values are stringified,
    so malformed JSON input leads to a "model not found" style error
    rather than an ``AttributeError``.
    """
    if model_id is None:
        return ""
    if not isinstance(model_id, str):
        return str(model_id)
    cleaned = model_id.strip()
    key = cleaned.lower()
    if key in ALIASES:
        return ALIASES[key]
    if key in _CANONICAL_IDS:
        return key
    return cleaned
993
+
994
+ # ═══════════════════════════════════════════════════════════════
995
+ # FLASK APP
996
+ # ═══════════════════════════════════════════════════════════════
997
+
998
# Module-level wiring, executed once at import time and shared by the routes below.
config = Config.from_env()
# SessionPool builds config.pool_size MultiModelClient instances.
pool = SessionPool(config)
# Initialise the default model on every pooled client before serving requests.
pool.init_default()

app = Flask(APP_NAME)
1003
+
1004
@app.after_request
def cors(response):
    """Stamp the permissive CORS headers onto every outgoing response."""
    for header_name, header_value in (
        ("Access-Control-Allow-Origin", "*"),
        ("Access-Control-Allow-Headers", "Content-Type, Authorization"),
        ("Access-Control-Allow-Methods", "GET, POST, OPTIONS"),
    ):
        response.headers[header_name] = header_value
    return response
1010
+
1011
@app.errorhandler(APIError)
def handle_api_error(e: APIError):
    """Render an APIError as JSON, using the error's own HTTP status."""
    details = e.to_dict()
    payload = {"ok": False, **details}
    return jsonify(payload), e.status
1014
+
1015
@app.route("/")
def index():
    """Describe the service: name, version, models and available endpoints."""
    return jsonify({
        "name": APP_NAME, "version": VERSION,
        "default_model": config.default_model,
        "models": list(MODEL_REGISTRY.keys()),
        "endpoints": {
            "POST /chat": "Chat with any model",
            "POST /chat/stream": "Streaming chat",
            "POST /v1/chat/completions": "OpenAI-compatible",
            "GET /v1/models": "List models",
            "POST /models/init": "Init a model",
            # These two routes are registered in this file but were missing
            # from the listing.
            "POST /new": "Start a new conversation",
            "GET /conversations": "List conversations",
            "GET /health": "Health check",
            "GET /metrics": "Metrics",
        },
    })
1031
+
1032
@app.route("/chat", methods=["POST"])
def chat():
    """Handle a single non-streaming chat turn.

    JSON body fields: ``message`` (required string), ``model``,
    ``system_prompt``, ``temperature``, ``max_tokens``,
    ``include_thinking`` and ``new_conversation``. Responds with 400 when
    ``message`` is missing, empty or not a string.
    """
    data = freq.get_json(force=True, silent=True)
    if not isinstance(data, dict):
        # A JSON array/scalar body has no fields; treat it like an empty object.
        data = {}

    raw_message = data.get("message")
    # Non-string messages (null, numbers, lists) are rejected instead of
    # crashing on ``.strip()``.
    message = raw_message.strip() if isinstance(raw_message, str) else ""
    if not message:
        return jsonify({"ok": False, "error": "'message' required"}), 400

    requested = data.get("model")
    if requested is None or requested == "":
        requested = config.default_model
    # str() keeps alias resolution safe for non-string JSON values.
    model_id = resolve_alias(str(requested))
    include_thinking = data.get("include_thinking", config.include_thinking)

    client = pool.acquire()
    if data.get("new_conversation"):
        client.new_conversation(data.get("system_prompt"), model_id)
    result = client.send_message(message, model=model_id, system_prompt=data.get("system_prompt"),
                                 temperature=data.get("temperature"), max_tokens=data.get("max_tokens"),
                                 include_thinking=include_thinking)
    thinking, clean = ThinkingParser.split(result)
    resp = {"ok": True, "response": clean, "model": model_id,
            "conversation_id": client.active_conversation.conversation_id,
            "history_size": len(client.active_conversation.messages)}
    if thinking:
        resp["thinking"] = thinking
    return jsonify(resp)
1053
+
1054
@app.route("/chat/stream", methods=["POST"])
def chat_stream():
    """Handle a chat turn and reply as a server-sent event stream.

    Accepts the same JSON fields as ``/chat``. Models without streaming
    support produce a single ``chunk`` event. Failures raised while
    generating are reported as an event inside the stream.
    """
    data = freq.get_json(force=True, silent=True)
    if not isinstance(data, dict):
        # A JSON array/scalar body has no fields; treat it like an empty object.
        data = {}

    raw_message = data.get("message")
    # Non-string messages are rejected instead of crashing on ``.strip()``.
    message = raw_message.strip() if isinstance(raw_message, str) else ""
    if not message:
        return jsonify({"ok": False, "error": "'message' required"}), 400

    requested = data.get("model")
    if requested is None or requested == "":
        requested = config.default_model
    # str() keeps alias resolution safe for non-string JSON values.
    model_id = resolve_alias(str(requested))
    include_thinking = data.get("include_thinking", config.include_thinking)

    client = pool.acquire()
    if data.get("new_conversation"):
        client.new_conversation(data.get("system_prompt"), model_id)
    mdef = MODEL_REGISTRY.get(model_id)
    use_stream = mdef.supports_streaming if mdef else False

    def generate():
        try:
            if use_stream:
                for chunk in client.send_message(message, stream=True, model=model_id,
                                                 system_prompt=data.get("system_prompt"),
                                                 temperature=data.get("temperature"),
                                                 max_tokens=data.get("max_tokens"),
                                                 include_thinking=include_thinking):
                    yield f"data: {json.dumps({'chunk': chunk})}\n\n"
            else:
                result = client.send_message(message, model=model_id,
                                             system_prompt=data.get("system_prompt"),
                                             temperature=data.get("temperature"),
                                             max_tokens=data.get("max_tokens"),
                                             include_thinking=include_thinking)
                yield f"data: {json.dumps({'chunk': result})}\n\n"
            yield "data: [DONE]\n\n"
        except APIError as e:
            yield f"data: {json.dumps(e.to_dict())}\n\n"
        except Exception as e:
            # The response has already started, so the error handler cannot
            # run; report unexpected failures in-stream instead of cutting
            # the connection silently.
            yield f"data: {json.dumps({'ok': False, 'error': str(e)})}\n\n"

    return Response(stream_with_context(generate()), content_type="text/event-stream")
1089
+
1090
@app.route("/v1/models", methods=["GET"])
def list_models():
    """Return every registered model with its capability flags."""
    def describe(model_name, spec):
        # One catalogue entry per registry item.
        capabilities = {
            "vision": spec.supports_vision, "streaming": spec.supports_streaming,
            "system_prompt": spec.supports_system_prompt, "temperature": spec.supports_temperature,
            "history": spec.supports_history, "thinking": spec.supports_thinking,
        }
        return {
            "id": model_name, "object": "model", "owned_by": spec.owned_by, "created": 0,
            "description": spec.description,
            "capabilities": capabilities,
        }

    catalogue = [describe(name, spec) for name, spec in MODEL_REGISTRY.items()]
    return jsonify({"object": "list", "data": catalogue})
1104
+
1105
@app.route("/v1/chat/completions", methods=["POST", "OPTIONS"])
def openai_compat():
    """Chat-completions endpoint using OpenAI-style request/response shapes.

    The last ``user`` message is the prompt, the last ``system`` message
    becomes the system prompt, and the user/assistant messages that
    precede the prompt seed the conversation history. Messages after the
    final user turn are ignored. ``content`` may be a string or a list of
    ``{"type": "text", "text": ...}`` parts.
    """
    if freq.method == "OPTIONS":
        return "", 200

    def as_text(content):
        # Flatten message content to plain text.
        if isinstance(content, str):
            return content
        if isinstance(content, list):
            pieces = [part["text"] for part in content
                      if isinstance(part, dict) and isinstance(part.get("text"), str)]
            return "\n".join(pieces)
        return "" if content is None else str(content)

    def sse_chunk(delta, finish_reason=None):
        # Serialise one chat.completion.chunk event.
        payload = {'id': rid, 'object': 'chat.completion.chunk', 'created': created, 'model': model_id,
                   'choices': [{'index': 0, 'delta': delta, 'finish_reason': finish_reason}]}
        return f"data: {json.dumps(payload)}\n\n"

    data = freq.get_json(force=True, silent=True)
    if not isinstance(data, dict):
        data = {}
    messages = data.get("messages") or []
    do_stream = data.get("stream", False)
    temperature = data.get("temperature")
    max_tokens = data.get("max_tokens")
    requested = data.get("model")
    if requested is None or requested == "":
        requested = config.default_model
    # str() keeps alias resolution safe for non-string JSON values.
    model_id = resolve_alias(str(requested))
    include_thinking = data.get("include_thinking", config.include_thinking)

    if model_id not in MODEL_REGISTRY:
        return jsonify({"error": {"message": f"Model '{model_id}' not found", "type": "invalid_request_error"}}), 404
    if not isinstance(messages, list) or not messages:
        return jsonify({"error": {"message": "messages required"}}), 400

    # Normalise to (role, text) pairs, skipping entries that are not objects.
    turns = [(m.get("role"), as_text(m.get("content"))) for m in messages if isinstance(m, dict)]
    system_prompt = None
    last_user = None
    for position, (role, text) in enumerate(turns):
        if role == "system":
            system_prompt = text
        elif role == "user":
            last_user = position
    user_msg = turns[last_user][1] if last_user is not None else None
    if not user_msg:
        return jsonify({"error": {"message": "No user message"}}), 400

    rid = f"chatcmpl-{uuid.uuid4().hex[:29]}"
    created = int(time.time())
    client = pool.acquire()
    conv = client.new_conversation(system_prompt, model_id)

    # Seed history with what came before the prompt. Slicing at the prompt's
    # own index (not at -1) keeps the prompt from being replayed as history
    # when it is not the final message.
    for role, text in turns[:last_user]:
        if role in ("user", "assistant") and text:
            conv.add_message(role, text)

    mdef = MODEL_REGISTRY[model_id]
    # Pin the conversation so the request cannot land in another one if the
    # pooled client's active conversation changes in the meantime.
    conv_id = conv.conversation_id

    if do_stream:
        def generate():
            try:
                yield sse_chunk({'role': 'assistant'})
                if mdef.supports_streaming:
                    for chunk in client.send_message(user_msg, stream=True, model=model_id,
                                                     conversation_id=conv_id,
                                                     temperature=temperature, max_tokens=max_tokens,
                                                     include_thinking=include_thinking):
                        yield sse_chunk({'content': chunk})
                else:
                    result = client.send_message(user_msg, model=model_id, conversation_id=conv_id,
                                                 temperature=temperature, max_tokens=max_tokens,
                                                 include_thinking=include_thinking)
                    yield sse_chunk({'content': result})
                yield sse_chunk({}, 'stop')
                yield "data: [DONE]\n\n"
            except Exception as e:
                yield f"data: {json.dumps({'error': {'message': str(e)}})}\n\n"
        return Response(stream_with_context(generate()), content_type="text/event-stream")

    result = client.send_message(user_msg, model=model_id, conversation_id=conv_id,
                                 temperature=temperature, max_tokens=max_tokens,
                                 include_thinking=include_thinking)
    return jsonify({
        "id": rid, "object": "chat.completion", "created": created, "model": model_id,
        "choices": [{"index": 0, "message": {"role": "assistant", "content": result}, "finish_reason": "stop"}],
        # Rough estimate: about four characters per token.
        "usage": {"prompt_tokens": len(user_msg) // 4, "completion_tokens": len(result) // 4,
                  "total_tokens": (len(user_msg) + len(result)) // 4},
    })
1172
+
1173
@app.route("/new", methods=["POST"])
def new_conv():
    """Start a fresh conversation on a pooled client and report its id."""
    payload = freq.get_json(force=True, silent=True) or {}
    requested = payload.get("model", config.default_model)
    model_id = resolve_alias(requested)
    holder = pool.acquire()
    started = holder.new_conversation(payload.get("system_prompt"), model_id)
    body = {"ok": True, "conversation_id": started.conversation_id, "model": model_id}
    return jsonify(body)
1180
+
1181
@app.route("/health", methods=["GET"])
def health():
    """Report the status of the next pooled client."""
    status = pool.acquire().get_status()
    return jsonify(status)
1185
+
1186
@app.route("/metrics", methods=["GET"])
def metrics_endpoint():
    """Expose the collected metrics as JSON."""
    snapshot = metrics.to_dict()
    return jsonify(snapshot)
1189
+
1190
@app.route("/conversations", methods=["GET"])
def conversations():
    """List the conversations held by the next pooled client."""
    holder = pool.acquire()
    listing = []
    for conv in holder._conversations.values():
        listing.append(conv.to_dict())
    return jsonify({"conversations": listing})
1194
+
1195
@app.route("/models/init", methods=["POST"])
def init_model_ep():
    """Initialise a model on every pooled client and report how many succeeded."""
    payload = freq.get_json(force=True, silent=True) or {}
    model_id = resolve_alias(payload.get("model", ""))
    is_known = bool(model_id) and model_id in MODEL_REGISTRY
    if not is_known:
        return jsonify({"ok": False, "error": f"Unknown model. Available: {list(MODEL_REGISTRY.keys())}"}), 400
    ready_count = pool.init_model(model_id)
    return jsonify({"ok": True, "model": model_id, "initialized_clients": ready_count})
1203
+
1204
+ # ═══════════════════════════════════════════════════════════════
1205
+ # ENTRY POINT (for HuggingFace Spaces)
1206
+ # ═══════════════════════════════════════════════════════════════
1207
+
1208
if __name__ == "__main__":
    # The PORT environment variable overrides the default port 7860.
    port = int(os.environ.get("PORT", 7860))
    log.info(f"Starting Multi-Model AI API v{VERSION} on port {port}")
    log.info(f"Models: {list(MODEL_REGISTRY.keys())}")
    # Listen on all interfaces using Flask's built-in threaded server.
    app.run(host="0.0.0.0", port=port, threaded=True)