Update app.py #1
by Voxxium - opened

app.py CHANGED
|
@@ -1,20 +1,21 @@
#!/usr/bin/env python3
"""
Multi-Model AI API – HuggingFace Spaces Edition
- With load balancing
"""

- import re, os, json, uuid, time, random, string, logging, threading
from abc import ABC, abstractmethod
from collections import deque
from dataclasses import dataclass, field
from typing import Any, Dict, Generator, List, Optional, Tuple, Union

import requests
from flask import Flask, request as freq, jsonify, Response, stream_with_context

try:
-     from gradio_client import Client as GradioClient
    HAS_GRADIO_CLIENT = True
except ImportError:
    HAS_GRADIO_CLIENT = False
|
|
@@ -23,7 +24,7 @@ except ImportError:
|
|
| 23 |
# CONFIG & CONSTANTS
|
| 24 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 25 |
|
| 26 |
-
VERSION = "
|
| 27 |
APP_NAME = "Multi-Model-AI-API"
|
| 28 |
DEFAULT_SYSTEM_PROMPT = "You are a helpful, friendly AI assistant."
|
| 29 |
DEFAULT_MODEL = "gpt-oss-120b"
|
|
@@ -38,6 +39,83 @@ USER_AGENTS = [
|
|
| 38 |
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:128.0) Gecko/20100101 Firefox/128.0",
|
| 39 |
]
|
| 40 |
|
| 41 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 42 |
# MODEL REGISTRY
|
| 43 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
@@ -65,7 +143,8 @@ class ModelDef:
|
|
| 65 |
clean_analysis: bool = False
|
| 66 |
lb_pool_size: int = 2
|
| 67 |
lb_enabled: bool = True
|
| 68 |
-
is_beta: bool = False
|
|
|
|
| 69 |
|
| 70 |
MODEL_REGISTRY: Dict[str, ModelDef] = {}
|
| 71 |
|
|
@@ -101,9 +180,8 @@ def _init_registry():
|
|
| 101 |
supports_temperature=False, supports_streaming=False, supports_history=False,
|
| 102 |
supports_thinking=False, max_tokens_default=700,
|
| 103 |
extra_params={"max_new_tokens": 700},
|
| 104 |
-
lb_pool_size=1, lb_enabled=False,
|
| 105 |
))
|
| 106 |
-
# ── NEW: Command-A Reasoning ──
|
| 107 |
register_model(ModelDef(
|
| 108 |
model_id="command-a-reasoning", display_name="Cohere Command-A Reasoning",
|
| 109 |
provider_type="gradio_client", space_id="CohereLabs/command-a-reasoning",
|
|
@@ -153,7 +231,6 @@ def _init_registry():
|
|
| 153 |
supports_thinking=False, max_tokens_default=4096,
|
| 154 |
lb_pool_size=2, lb_enabled=True,
|
| 155 |
))
|
| 156 |
-
# ── NEW: Qwen2.5-Coder (BETA) ──
|
| 157 |
register_model(ModelDef(
|
| 158 |
model_id="qwen2.5-coder", display_name="Qwen2.5-Coder Artifacts (BETA)",
|
| 159 |
provider_type="gradio_client", space_id="Qwen/Qwen2.5-Coder-Artifacts",
|
|
@@ -194,7 +271,7 @@ class Config:
|
|
| 194 |
rate_limit_burst: int = 15
|
| 195 |
pool_size: int = 2
|
| 196 |
max_history_messages: int = 50
|
| 197 |
-
max_message_length: int =
|
| 198 |
default_temperature: float = 0.7
|
| 199 |
include_thinking: bool = True
|
| 200 |
log_sse_raw: bool = False
|
|
@@ -376,20 +453,15 @@ class ResponseCleaner:
|
|
| 376 |
|
| 377 |
@classmethod
|
| 378 |
def extract_qwen_coder_text(cls, result: Any) -> str:
|
| 379 |
-
"""Extract text from Qwen2.5-Coder /generation_code response.
|
| 380 |
-
Returns tuple of (markdown, html). We want the markdown part."""
|
| 381 |
if result is None:
|
| 382 |
return ""
|
| 383 |
if isinstance(result, str):
|
| 384 |
return result.strip()
|
| 385 |
if isinstance(result, tuple):
|
| 386 |
-
# /generation_code returns (markdown_str, html_str)
|
| 387 |
-
# We want the markdown part (index 0)
|
| 388 |
if len(result) >= 1 and isinstance(result[0], str):
|
| 389 |
text = result[0].strip()
|
| 390 |
if text:
|
| 391 |
return text
|
| 392 |
-
# Fallback to second element if first is empty
|
| 393 |
if len(result) >= 2 and isinstance(result[1], str):
|
| 394 |
return result[1].strip()
|
| 395 |
if isinstance(result, (list, dict)):
|
|
@@ -568,7 +640,7 @@ class Metrics:
|
|
| 568 |
metrics = Metrics()
|
| 569 |
|
| 570 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 571 |
-
# RATE LIMITER β
|
| 572 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 573 |
|
| 574 |
class RateLimiter:
|
|
@@ -713,7 +785,7 @@ class ModelProvider(ABC):
|
|
| 713 |
|
| 714 |
@abstractmethod
|
| 715 |
def generate(self, message: str, history=None, system_prompt=None,
|
| 716 |
-
temperature=None, max_tokens=None, **kwargs) -> str: ...
|
| 717 |
|
| 718 |
def generate_stream(self, message: str, **kwargs) -> Generator[str, None, None]:
|
| 719 |
yield self.generate(message, **kwargs)
|
|
@@ -795,7 +867,7 @@ class GptOssProvider(ModelProvider):
|
|
| 795 |
return False
|
| 796 |
|
| 797 |
def generate(self, message, history=None, system_prompt=None,
|
| 798 |
-
temperature=None, max_tokens=None, **kw):
|
| 799 |
if not self.ready:
|
| 800 |
self.initialize()
|
| 801 |
sys_p = system_prompt or self.config.default_system_prompt
|
|
@@ -848,7 +920,7 @@ class GptOssProvider(ModelProvider):
|
|
| 848 |
if self.model_def.clean_analysis else full)
|
| 849 |
|
| 850 |
def generate_stream(self, message, history=None, system_prompt=None,
|
| 851 |
-
temperature=None, max_tokens=None, **kw):
|
| 852 |
if not self.ready:
|
| 853 |
self.initialize()
|
| 854 |
sys_p = system_prompt or self.config.default_system_prompt
|
|
@@ -933,19 +1005,31 @@ class GradioClientProvider(ModelProvider):
|
|
| 933 |
return False
|
| 934 |
|
| 935 |
def generate(self, message, history=None, system_prompt=None,
|
| 936 |
-
temperature=None, max_tokens=None, **kw):
|
| 937 |
if not self.ready:
|
| 938 |
self.initialize()
|
| 939 |
if not self._client:
|
| 940 |
raise APIError(f"{self.model_def.model_id} not initialized")
|
| 941 |
|
| 942 |
mid = self.model_def.model_id
|
|
| 943 |
try:
|
| 944 |
if mid == "command-a-vision":
|
| 945 |
max_new = (max_tokens
|
| 946 |
or self.model_def.extra_params.get("max_new_tokens", 700))
|
| 947 |
result = self._client.predict(
|
| 948 |
-
message=
|
| 949 |
max_new_tokens=max_new,
|
| 950 |
api_name=self.model_def.api_name,
|
| 951 |
)
|
|
@@ -960,7 +1044,6 @@ class GradioClientProvider(ModelProvider):
|
|
| 960 |
)
|
| 961 |
|
| 962 |
elif mid == "command-a-reasoning":
|
| 963 |
-
# Cohere Command-A Reasoning with thinking budget
|
| 964 |
thinking_budget = kw.get(
|
| 965 |
"thinking_budget",
|
| 966 |
self.model_def.extra_params.get("thinking_budget", 500),
|
|
@@ -979,8 +1062,14 @@ class GradioClientProvider(ModelProvider):
|
|
| 979 |
or self.model_def.extra_params.get("max_tokens", 12800))
|
| 980 |
top_p = kw.get("top_p",
|
| 981 |
self.model_def.extra_params.get("top_p", 0.9))
| 982 |
result = self._client.predict(
|
| 983 |
-
message={"text": message, "files":
|
| 984 |
max_tokens=max_tok, temperature=temp, top_p=top_p,
|
| 985 |
api_name=self.model_def.api_name,
|
| 986 |
)
|
|
@@ -1019,14 +1108,22 @@ class GradioClientProvider(ModelProvider):
|
|
| 1019 |
return ResponseCleaner.extract_chatgpt_text(result)
|
| 1020 |
|
| 1021 |
elif mid == "qwen3-vl":
|
| 1022 |
-
|
| 1023 |
-
|
| 1024 |
-
|
| 1025 |
-
|
|
| 1026 |
return ResponseCleaner.extract_qwen_text(result)
|
| 1027 |
|
| 1028 |
elif mid == "qwen2.5-coder":
|
| 1029 |
-
# First set the system prompt to override artifacts behavior
|
| 1030 |
sys_override = self.model_def.extra_params.get(
|
| 1031 |
"system_prompt_override", ""
|
| 1032 |
)
|
|
@@ -1062,33 +1159,27 @@ class GradioClientProvider(ModelProvider):
|
|
| 1062 |
raise APIError(f"{mid} error: {e}", "PROVIDER_ERROR")
|
| 1063 |
|
| 1064 |
def _extract_reasoning(self, result: Any) -> str:
|
| 1065 |
-
"""Extract response from Command-A Reasoning.
|
| 1066 |
-
The API returns str | float | bool | list | dict from the Json component."""
|
| 1067 |
if result is None:
|
| 1068 |
return ""
|
| 1069 |
if isinstance(result, str):
|
| 1070 |
return result.strip()
|
| 1071 |
if isinstance(result, dict):
|
| 1072 |
-
# Try common response keys
|
| 1073 |
for key in ("response", "output", "answer", "text", "content", "result"):
|
| 1074 |
if key in result:
|
| 1075 |
val = result[key]
|
| 1076 |
if isinstance(val, str):
|
| 1077 |
return val.strip()
|
| 1078 |
return str(val)
|
| 1079 |
-
# Check for thinking + response structure
|
| 1080 |
thinking = result.get("thinking", "")
|
| 1081 |
response = result.get("response", result.get("output", ""))
|
| 1082 |
if thinking and response:
|
| 1083 |
return f"<thinking>\n{thinking}\n</thinking>\n{response}"
|
| 1084 |
if response:
|
| 1085 |
return str(response).strip()
|
| 1086 |
-
# Fallback: serialize entire dict
|
| 1087 |
return json.dumps(result, ensure_ascii=False, indent=2)
|
| 1088 |
if isinstance(result, (list, tuple)):
|
| 1089 |
if len(result) == 1:
|
| 1090 |
return str(result[0]).strip()
|
| 1091 |
-
# Try to find text in list elements
|
| 1092 |
texts = []
|
| 1093 |
for item in result:
|
| 1094 |
if isinstance(item, str) and item.strip():
|
|
@@ -1120,7 +1211,6 @@ class GradioClientProvider(ModelProvider):
|
|
| 1120 |
return ResponseCleaner.clean_glm(str(result), include_thinking)
|
| 1121 |
|
| 1122 |
|
| 1123 |
-
# Factory
|
| 1124 |
def create_provider(model_id: str, config: Config,
|
| 1125 |
instance_id: int = 0) -> ModelProvider:
|
| 1126 |
if model_id not in MODEL_REGISTRY:
|
|
@@ -1253,7 +1343,7 @@ class LoadBalancedProviderPool:
|
|
| 1253 |
inst.record_failure()
|
| 1254 |
log.warning(
|
| 1255 |
f"[LB] Failover instance {inst.instance_id} "
|
| 1256 |
-
f"for '{self.model_id}'
|
| 1257 |
)
|
| 1258 |
|
| 1259 |
raise APIError(
|
|
@@ -1336,7 +1426,7 @@ class LoadBalancedProviderPool:
|
|
| 1336 |
}
|
| 1337 |
|
| 1338 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1339 |
-
# MULTI-MODEL CLIENT
|
| 1340 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1341 |
|
| 1342 |
class MultiModelClient:
|
|
@@ -1415,7 +1505,7 @@ class MultiModelClient:
|
|
| 1415 |
|
| 1416 |
def send_message(
|
| 1417 |
self,
|
| 1418 |
-
message:
|
| 1419 |
*,
|
| 1420 |
stream: bool = False,
|
| 1421 |
model: Optional[str] = None,
|
|
@@ -1424,14 +1514,27 @@ class MultiModelClient:
|
|
| 1424 |
temperature: Optional[float] = None,
|
| 1425 |
max_tokens: Optional[int] = None,
|
| 1426 |
include_thinking: Optional[bool] = None,
|
|
|
|
| 1427 |
**kwargs,
|
| 1428 |
) -> Union[str, Generator]:
|
| 1429 |
model_id = model or self._current_model
|
| 1430 |
if model_id not in MODEL_REGISTRY:
|
| 1431 |
raise ModelNotFoundError(model_id)
|
| 1432 |
mdef = MODEL_REGISTRY[model_id]
|
| 1433 |
-
|
| 1434 |
-
|
|
| 1435 |
raise APIError("Empty message", "INVALID_INPUT", 400)
|
| 1436 |
if len(message) > self.config.max_message_length:
|
| 1437 |
raise APIError("Message too long", "INVALID_INPUT", 400)
|
|
@@ -1479,6 +1582,7 @@ class MultiModelClient:
|
|
| 1479 |
system_prompt=eff_sys,
|
| 1480 |
temperature=eff_temp,
|
| 1481 |
max_tokens=max_tokens,
|
|
|
|
| 1482 |
**extra,
|
| 1483 |
)
|
| 1484 |
return self._wrap_stream(gen, conv, start, model_id)
|
|
@@ -1490,6 +1594,7 @@ class MultiModelClient:
|
|
| 1490 |
system_prompt=eff_sys,
|
| 1491 |
temperature=eff_temp,
|
| 1492 |
max_tokens=max_tokens,
|
|
|
|
| 1493 |
**extra,
|
| 1494 |
)
|
| 1495 |
dur = (time.monotonic() - start) * 1000
|
|
@@ -1581,7 +1686,7 @@ class SessionPool:
|
|
| 1581 |
return c
|
| 1582 |
|
| 1583 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1584 |
-
#
|
| 1585 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1586 |
|
| 1587 |
ALIASES = {
|
|
@@ -1602,6 +1707,8 @@ ALIASES = {
|
|
| 1602 |
|
| 1603 |
|
| 1604 |
def resolve_alias(model_id: str) -> str:
|
| 1605 |
return ALIASES.get(model_id.lower(), model_id)
|
| 1606 |
|
| 1607 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
@@ -1634,18 +1741,19 @@ def index():
|
|
| 1634 |
"name": APP_NAME,
|
| 1635 |
"version": VERSION,
|
| 1636 |
"default_model": config.default_model,
|
| 1637 |
-
"features": ["load_balancing", "10_req_per_second_limit", "failover"],
|
| 1638 |
"models": list(MODEL_REGISTRY.keys()),
|
| 1639 |
"beta_models": [mid for mid, mdef in MODEL_REGISTRY.items() if mdef.is_beta],
|
|
|
|
| 1640 |
"endpoints": {
|
| 1641 |
"POST /chat": "Chat with any model",
|
| 1642 |
"POST /chat/stream": "Streaming chat",
|
| 1643 |
-
"POST /v1/chat/completions": "OpenAI-compatible",
|
| 1644 |
"GET /v1/models": "List models",
|
| 1645 |
"POST /models/init": "Init a model",
|
| 1646 |
-
"GET /health": "Health check
|
| 1647 |
"GET /metrics": "Metrics",
|
| 1648 |
-
"GET /lb/status": "Load balancer
|
| 1649 |
},
|
| 1650 |
})
|
| 1651 |
|
|
@@ -1653,16 +1761,26 @@ def index():
|
|
| 1653 |
@app.route("/chat", methods=["POST"])
|
| 1654 |
def chat():
|
| 1655 |
data = freq.get_json(force=True, silent=True) or {}
|
| 1656 |
-
|
| 1657 |
-
|
|
| 1658 |
return jsonify({"ok": False, "error": "'message' required"}), 400
|
|
|
|
| 1659 |
model_id = resolve_alias(data.get("model", config.default_model))
|
| 1660 |
include_thinking = data.get("include_thinking", config.include_thinking)
|
| 1661 |
client = pool.acquire()
|
| 1662 |
if data.get("new_conversation"):
|
| 1663 |
client.new_conversation(data.get("system_prompt"), model_id)
|
| 1664 |
|
| 1665 |
-
# Pass extra params for specific models
|
| 1666 |
extra = {}
|
| 1667 |
if model_id == "command-a-reasoning" and "thinking_budget" in data:
|
| 1668 |
extra["thinking_budget"] = data["thinking_budget"]
|
|
@@ -1673,6 +1791,7 @@ def chat():
|
|
| 1673 |
temperature=data.get("temperature"),
|
| 1674 |
max_tokens=data.get("max_tokens"),
|
| 1675 |
include_thinking=include_thinking,
|
|
|
|
| 1676 |
**extra,
|
| 1677 |
)
|
| 1678 |
thinking, clean = ThinkingParser.split(result)
|
|
@@ -1694,9 +1813,19 @@ def chat():
|
|
| 1694 |
@app.route("/chat/stream", methods=["POST"])
|
| 1695 |
def chat_stream():
|
| 1696 |
data = freq.get_json(force=True, silent=True) or {}
|
| 1697 |
-
|
| 1698 |
-
|
|
| 1699 |
return jsonify({"ok": False, "error": "'message' required"}), 400
|
|
|
|
| 1700 |
model_id = resolve_alias(data.get("model", config.default_model))
|
| 1701 |
include_thinking = data.get("include_thinking", config.include_thinking)
|
| 1702 |
client = pool.acquire()
|
|
@@ -1718,6 +1847,7 @@ def chat_stream():
|
|
| 1718 |
temperature=data.get("temperature"),
|
| 1719 |
max_tokens=data.get("max_tokens"),
|
| 1720 |
include_thinking=include_thinking,
|
|
|
|
| 1721 |
**extra,
|
| 1722 |
):
|
| 1723 |
yield f"data: {json.dumps({'chunk': chunk})}\n\n"
|
|
@@ -1728,6 +1858,7 @@ def chat_stream():
|
|
| 1728 |
temperature=data.get("temperature"),
|
| 1729 |
max_tokens=data.get("max_tokens"),
|
| 1730 |
include_thinking=include_thinking,
|
|
|
|
| 1731 |
**extra,
|
| 1732 |
)
|
| 1733 |
yield f"data: {json.dumps({'chunk': result})}\n\n"
|
|
@@ -1772,6 +1903,7 @@ def list_models():
|
|
| 1772 |
def openai_compat():
|
| 1773 |
if freq.method == "OPTIONS":
|
| 1774 |
return "", 200
|
|
|
|
| 1775 |
data = freq.get_json(force=True, silent=True) or {}
|
| 1776 |
messages = data.get("messages", [])
|
| 1777 |
do_stream = data.get("stream", False)
|
|
@@ -1783,20 +1915,38 @@ def openai_compat():
|
|
| 1783 |
if model_id not in MODEL_REGISTRY:
|
| 1784 |
return jsonify({
|
| 1785 |
"error": {
|
| 1786 |
-
"message": f"Model '{model_id}' not found",
|
| 1787 |
"type": "invalid_request_error",
|
|
|
|
| 1788 |
}
|
| 1789 |
}), 404
|
|
|
|
| 1790 |
if not messages:
|
| 1791 |
return jsonify({"error": {"message": "messages required"}}), 400
|
| 1792 |
|
| 1793 |
-
|
| 1794 |
for msg in messages:
|
| 1795 |
-
|
| 1796 |
-
|
| 1797 |
-
|
| 1798 |
-
|
| 1799 |
-
|
|
| 1800 |
return jsonify({"error": {"message": "No user message"}}), 400
|
| 1801 |
|
| 1802 |
rid = f"chatcmpl-{uuid.uuid4().hex[:29]}"
|
|
@@ -1804,15 +1954,21 @@ def openai_compat():
|
|
| 1804 |
client = pool.acquire()
|
| 1805 |
client.new_conversation(system_prompt, model_id)
|
| 1806 |
|
|
|
|
| 1807 |
for msg in messages[:-1]:
|
| 1808 |
role = msg.get("role")
|
| 1809 |
content = msg.get("content", "")
|
| 1810 |
if role in ("user", "assistant") and content:
|
| 1811 |
-
|
| 1812 |
|
| 1813 |
mdef = MODEL_REGISTRY[model_id]
|
| 1814 |
|
| 1815 |
-
# Extra params
|
| 1816 |
extra = {}
|
| 1817 |
if model_id == "command-a-reasoning" and "thinking_budget" in data:
|
| 1818 |
extra["thinking_budget"] = data["thinking_budget"]
|
|
@@ -1825,14 +1981,15 @@ def openai_compat():
|
|
| 1825 |
for chunk in client.send_message(
|
| 1826 |
user_msg, stream=True, model=model_id,
|
| 1827 |
temperature=temperature, max_tokens=max_tokens,
|
| 1828 |
-
include_thinking=include_thinking,
|
|
|
|
| 1829 |
):
|
| 1830 |
yield f"data: {json.dumps({'id': rid, 'object': 'chat.completion.chunk', 'created': created, 'model': model_id, 'choices': [{'index': 0, 'delta': {'content': chunk}, 'finish_reason': None}]})}\n\n"
|
| 1831 |
else:
|
| 1832 |
result = client.send_message(
|
| 1833 |
user_msg, model=model_id, temperature=temperature,
|
| 1834 |
-
max_tokens=max_tokens,
|
| 1835 |
-
|
| 1836 |
)
|
| 1837 |
yield f"data: {json.dumps({'id': rid, 'object': 'chat.completion.chunk', 'created': created, 'model': model_id, 'choices': [{'index': 0, 'delta': {'content': result}, 'finish_reason': None}]})}\n\n"
|
| 1838 |
yield f"data: {json.dumps({'id': rid, 'object': 'chat.completion.chunk', 'created': created, 'model': model_id, 'choices': [{'index': 0, 'delta': {}, 'finish_reason': 'stop'}]})}\n\n"
|
|
@@ -1845,7 +2002,8 @@ def openai_compat():
|
|
| 1845 |
|
| 1846 |
result = client.send_message(
|
| 1847 |
user_msg, model=model_id, temperature=temperature,
|
| 1848 |
-
max_tokens=max_tokens, include_thinking=include_thinking,
|
|
|
|
| 1849 |
)
|
| 1850 |
return jsonify({
|
| 1851 |
"id": rid,
|
|
@@ -1943,7 +2101,7 @@ def init_model_ep():
|
|
| 1943 |
|
| 1944 |
if __name__ == "__main__":
|
| 1945 |
port = int(os.environ.get("PORT", 7860))
|
| 1946 |
-
log.info(f"Starting
|
| 1947 |
log.info(f"Models: {list(MODEL_REGISTRY.keys())}")
|
| 1948 |
log.info(f"Rate limit: {config.rate_limit_rps} req/s (burst: {config.rate_limit_burst})")
|
| 1949 |
for mid, mdef in MODEL_REGISTRY.items():
|
|
@@ -1952,6 +2110,7 @@ if __name__ == "__main__":
|
|
| 1952 |
if mdef.lb_enabled
|
| 1953 |
else "LB OFF (single instance)"
|
| 1954 |
)
|
|
|
|
| 1955 |
beta_str = " [BETA]" if mdef.is_beta else ""
|
| 1956 |
-
log.info(f" {mid}: {lb_str}{beta_str}")
|
| 1957 |
app.run(host="0.0.0.0", port=port, threaded=True)
|
|
|
|
#!/usr/bin/env python3
"""
Multi-Model AI API – HuggingFace Spaces Edition
+ With load balancing, 10 req/s rate limiting, vision support, and multimodal fixes.
"""

+ import re, os, json, uuid, time, random, string, logging, threading, base64
from abc import ABC, abstractmethod
from collections import deque
from dataclasses import dataclass, field
from typing import Any, Dict, Generator, List, Optional, Tuple, Union
+ from io import BytesIO

import requests
from flask import Flask, request as freq, jsonify, Response, stream_with_context

try:
+     from gradio_client import Client as GradioClient, handle_file
    HAS_GRADIO_CLIENT = True
except ImportError:
    HAS_GRADIO_CLIENT = False
|
|
|
|
| 24 |
# CONFIG & CONSTANTS
|
| 25 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 26 |
|
| 27 |
+
VERSION = "3.0.0-hf-lb"
|
| 28 |
APP_NAME = "Multi-Model-AI-API"
|
| 29 |
DEFAULT_SYSTEM_PROMPT = "You are a helpful, friendly AI assistant."
|
| 30 |
DEFAULT_MODEL = "gpt-oss-120b"
|
|
|
|
| 39 |
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:128.0) Gecko/20100101 Firefox/128.0",
|
| 40 |
]
|
| 41 |
|
+ # ───────────────────────────────────────────────────────────────
+ # MULTIMODAL HELPERS
+ # ───────────────────────────────────────────────────────────────
+
+ def extract_text_and_images(content: Any) -> Tuple[str, List[str]]:
+     """
+     Parse OpenAI-style multimodal content.
+     Returns (text, [base64_or_url, ...])
+     Handles: str, list of {type, text/image_url}
+     """
+     if content is None:
+         return "", []
+     if isinstance(content, str):
+         return content.strip(), []
+
+     texts: List[str] = []
+     images: List[str] = []
+
+     if isinstance(content, list):
+         for block in content:
+             if not isinstance(block, dict):
+                 texts.append(str(block))
+                 continue
+             btype = block.get("type", "")
+             if btype == "text":
+                 t = block.get("text", "")
+                 if t:
+                     texts.append(t)
+             elif btype == "image_url":
+                 img = block.get("image_url", {})
+                 url = img.get("url", "") if isinstance(img, dict) else str(img)
+                 if url:
+                     images.append(url)
+             elif btype == "image":
+                 # Alternative format
+                 src = block.get("source", {})
+                 if isinstance(src, dict):
+                     data = src.get("data", "")
+                     if data:
+                         media = src.get("media_type", "image/jpeg")
+                         images.append(f"data:{media};base64,{data}")
+
+     return " ".join(texts).strip(), images
+
+
+ def decode_image_to_bytes(image_url: str) -> Optional[Tuple[bytes, str]]:
+     """Convert image URL or data URI to (bytes, media_type)."""
+     try:
+         if image_url.startswith("data:"):
+             # data:image/jpeg;base64,/9j/...
+             header, data = image_url.split(",", 1)
+             media_type = header.split(";")[0].split(":")[1]
+             return base64.b64decode(data), media_type
+         else:
+             # Remote URL
+             r = requests.get(image_url, timeout=15)
+             r.raise_for_status()
+             ct = r.headers.get("content-type", "image/jpeg").split(";")[0]
+             return r.content, ct
+     except Exception as e:
+         log.warning(f"Failed to decode image: {e}")
+         return None
+
+
+ def save_image_temp(image_url: str) -> Optional[str]:
+     """Save image to a temp file and return path (for gradio_client)."""
+     import tempfile
+     result = decode_image_to_bytes(image_url)
+     if not result:
+         return None
+     data, media_type = result
+     ext = media_type.split("/")[-1].replace("jpeg", "jpg")
+     with tempfile.NamedTemporaryFile(suffix=f".{ext}", delete=False) as f:
+         f.write(data)
+     return f.name
+
+
|
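For orientation, a quick sketch of how the new helper behaves on an OpenAI-style content list; the payload is an invented example, not data from any Space:

    blocks = [
        {"type": "text", "text": "Describe this image."},
        {"type": "image_url", "image_url": {"url": "https://example.com/cat.jpg"}},
        {"type": "image", "source": {"media_type": "image/png", "data": "iVBORw0..."}},
    ]
    text, images = extract_text_and_images(blocks)
    # text   -> "Describe this image."
    # images -> ["https://example.com/cat.jpg", "data:image/png;base64,iVBORw0..."]

save_image_temp() then turns either kind of entry into a local file path that handle_file() can upload to a Space.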
| 119 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 120 |
# MODEL REGISTRY
|
| 121 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
|
|
| 143 |
clean_analysis: bool = False
|
| 144 |
lb_pool_size: int = 2
|
| 145 |
lb_enabled: bool = True
|
| 146 |
+
is_beta: bool = False
|
| 147 |
+
|
| 148 |
|
| 149 |
MODEL_REGISTRY: Dict[str, ModelDef] = {}
|
| 150 |
|
|
|
|
| 180 |
supports_temperature=False, supports_streaming=False, supports_history=False,
|
| 181 |
supports_thinking=False, max_tokens_default=700,
|
| 182 |
extra_params={"max_new_tokens": 700},
|
| 183 |
+
lb_pool_size=1, lb_enabled=False,
|
| 184 |
))
|
|
|
|
| 185 |
register_model(ModelDef(
|
| 186 |
model_id="command-a-reasoning", display_name="Cohere Command-A Reasoning",
|
| 187 |
provider_type="gradio_client", space_id="CohereLabs/command-a-reasoning",
|
|
|
|
| 231 |
supports_thinking=False, max_tokens_default=4096,
|
| 232 |
lb_pool_size=2, lb_enabled=True,
|
| 233 |
))
|
|
|
|
| 234 |
register_model(ModelDef(
|
| 235 |
model_id="qwen2.5-coder", display_name="Qwen2.5-Coder Artifacts (BETA)",
|
| 236 |
provider_type="gradio_client", space_id="Qwen/Qwen2.5-Coder-Artifacts",
|
|
|
|
| 271 |
rate_limit_burst: int = 15
|
| 272 |
pool_size: int = 2
|
| 273 |
max_history_messages: int = 50
|
| 274 |
+
max_message_length: int = 32000
|
| 275 |
default_temperature: float = 0.7
|
| 276 |
include_thinking: bool = True
|
| 277 |
log_sse_raw: bool = False
|
|
|
|
| 453 |
|
| 454 |
@classmethod
|
| 455 |
def extract_qwen_coder_text(cls, result: Any) -> str:
|
| 456 |
if result is None:
|
| 457 |
return ""
|
| 458 |
if isinstance(result, str):
|
| 459 |
return result.strip()
|
| 460 |
if isinstance(result, tuple):
|
| 461 |
if len(result) >= 1 and isinstance(result[0], str):
|
| 462 |
text = result[0].strip()
|
| 463 |
if text:
|
| 464 |
return text
|
|
|
|
| 465 |
if len(result) >= 2 and isinstance(result[1], str):
|
| 466 |
return result[1].strip()
|
| 467 |
if isinstance(result, (list, dict)):
|
|
|
|
| 640 |
metrics = Metrics()
|
| 641 |
|
| 642 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 643 |
+
# RATE LIMITER – token bucket (10 req/s)
|
| 644 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 645 |
|
| 646 |
class RateLimiter:
|
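The RateLimiter body itself is unchanged and therefore folded out of this diff. Purely as a reminder of what the new header describes, a minimal token-bucket sketch using the 10 req/s and burst 15 figures from the Config above (not the class as implemented here):

    import threading
    import time

    class TokenBucket:
        def __init__(self, rate: float = 10.0, burst: int = 15):
            self.rate, self.capacity = rate, float(burst)
            self.tokens = float(burst)
            self.last = time.monotonic()
            self.lock = threading.Lock()

        def allow(self) -> bool:
            with self.lock:
                now = time.monotonic()
                # Refill proportionally to elapsed time, capped at the burst size
                self.tokens = min(self.capacity, self.tokens + (now - self.last) * self.rate)
                self.last = now
                if self.tokens >= 1.0:
                    self.tokens -= 1.0
                    return True
                return False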
|
|
|
| 785 |
|
| 786 |
@abstractmethod
|
| 787 |
def generate(self, message: str, history=None, system_prompt=None,
|
| 788 |
+
temperature=None, max_tokens=None, images=None, **kwargs) -> str: ...
|
| 789 |
|
| 790 |
def generate_stream(self, message: str, **kwargs) -> Generator[str, None, None]:
|
| 791 |
yield self.generate(message, **kwargs)
|
|
|
|
| 867 |
return False
|
| 868 |
|
| 869 |
def generate(self, message, history=None, system_prompt=None,
|
| 870 |
+
temperature=None, max_tokens=None, images=None, **kw):
|
| 871 |
if not self.ready:
|
| 872 |
self.initialize()
|
| 873 |
sys_p = system_prompt or self.config.default_system_prompt
|
|
|
|
| 920 |
if self.model_def.clean_analysis else full)
|
| 921 |
|
| 922 |
def generate_stream(self, message, history=None, system_prompt=None,
|
| 923 |
+
temperature=None, max_tokens=None, images=None, **kw):
|
| 924 |
if not self.ready:
|
| 925 |
self.initialize()
|
| 926 |
sys_p = system_prompt or self.config.default_system_prompt
|
|
|
|
| 1005 |
return False
|
| 1006 |
|
| 1007 |
def generate(self, message, history=None, system_prompt=None,
|
| 1008 |
+
temperature=None, max_tokens=None, images=None, **kw):
|
| 1009 |
if not self.ready:
|
| 1010 |
self.initialize()
|
| 1011 |
if not self._client:
|
| 1012 |
raise APIError(f"{self.model_def.model_id} not initialized")
|
| 1013 |
|
| 1014 |
mid = self.model_def.model_id
|
| 1015 |
+
images = images or []
|
| 1016 |
+
|
| 1017 |
try:
|
| 1018 |
if mid == "command-a-vision":
|
| 1019 |
max_new = (max_tokens
|
| 1020 |
or self.model_def.extra_params.get("max_new_tokens", 700))
|
| 1021 |
+
# Build multimodal message
|
| 1022 |
+
msg_payload: Any
|
| 1023 |
+
if images:
|
| 1024 |
+
img_path = save_image_temp(images[0])
|
| 1025 |
+
if img_path:
|
| 1026 |
+
msg_payload = {"text": message, "files": [handle_file(img_path)]}
|
| 1027 |
+
else:
|
| 1028 |
+
msg_payload = {"text": message, "files": []}
|
| 1029 |
+
else:
|
| 1030 |
+
msg_payload = {"text": message, "files": []}
|
| 1031 |
result = self._client.predict(
|
| 1032 |
+
message=msg_payload,
|
| 1033 |
max_new_tokens=max_new,
|
| 1034 |
api_name=self.model_def.api_name,
|
| 1035 |
)
|
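Outside the app, the same vision path can be exercised directly with gradio_client. The Space id and api_name below are placeholders; the real values live in the command-a-vision ModelDef, which this hunk does not show:

    from gradio_client import Client, handle_file

    c = Client("CohereLabs/command-a-vision")        # placeholder space id
    out = c.predict(
        message={"text": "What is in this photo?", "files": [handle_file("photo.jpg")]},
        max_new_tokens=700,                          # default taken from extra_params above
        api_name="/chat",                            # placeholder api_name
    )
    print(out)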
|
|
|
| 1044 |
)
|
| 1045 |
|
| 1046 |
elif mid == "command-a-reasoning":
|
|
|
|
| 1047 |
thinking_budget = kw.get(
|
| 1048 |
"thinking_budget",
|
| 1049 |
self.model_def.extra_params.get("thinking_budget", 500),
|
|
|
|
| 1062 |
or self.model_def.extra_params.get("max_tokens", 12800))
|
| 1063 |
top_p = kw.get("top_p",
|
| 1064 |
self.model_def.extra_params.get("top_p", 0.9))
|
| 1065 |
+
# Vision support
|
| 1066 |
+
if images:
|
| 1067 |
+
img_path = save_image_temp(images[0])
|
| 1068 |
+
files = [handle_file(img_path)] if img_path else []
|
| 1069 |
+
else:
|
| 1070 |
+
files = []
|
| 1071 |
result = self._client.predict(
|
| 1072 |
+
message={"text": message, "files": files},
|
| 1073 |
max_tokens=max_tok, temperature=temp, top_p=top_p,
|
| 1074 |
api_name=self.model_def.api_name,
|
| 1075 |
)
|
|
|
|
| 1108 |
return ResponseCleaner.extract_chatgpt_text(result)
|
| 1109 |
|
| 1110 |
elif mid == "qwen3-vl":
|
| 1111 |
+
# Vision support
|
| 1112 |
+
if images:
|
| 1113 |
+
img_path = save_image_temp(images[0])
|
| 1114 |
+
files = [handle_file(img_path)] if img_path else []
|
| 1115 |
+
result = self._client.predict(
|
| 1116 |
+
input_value={"files": files, "text": message},
|
| 1117 |
+
api_name="/add_message",
|
| 1118 |
+
)
|
| 1119 |
+
else:
|
| 1120 |
+
result = self._client.predict(
|
| 1121 |
+
input_value={"files": None, "text": message},
|
| 1122 |
+
api_name="/add_message",
|
| 1123 |
+
)
|
| 1124 |
return ResponseCleaner.extract_qwen_text(result)
|
| 1125 |
|
| 1126 |
elif mid == "qwen2.5-coder":
|
|
|
|
| 1127 |
sys_override = self.model_def.extra_params.get(
|
| 1128 |
"system_prompt_override", ""
|
| 1129 |
)
|
|
|
|
| 1159 |
raise APIError(f"{mid} error: {e}", "PROVIDER_ERROR")
|
| 1160 |
|
| 1161 |
def _extract_reasoning(self, result: Any) -> str:
|
| 1162 |
if result is None:
|
| 1163 |
return ""
|
| 1164 |
if isinstance(result, str):
|
| 1165 |
return result.strip()
|
| 1166 |
if isinstance(result, dict):
|
|
|
|
| 1167 |
for key in ("response", "output", "answer", "text", "content", "result"):
|
| 1168 |
if key in result:
|
| 1169 |
val = result[key]
|
| 1170 |
if isinstance(val, str):
|
| 1171 |
return val.strip()
|
| 1172 |
return str(val)
|
|
|
|
| 1173 |
thinking = result.get("thinking", "")
|
| 1174 |
response = result.get("response", result.get("output", ""))
|
| 1175 |
if thinking and response:
|
| 1176 |
return f"<thinking>\n{thinking}\n</thinking>\n{response}"
|
| 1177 |
if response:
|
| 1178 |
return str(response).strip()
|
|
|
|
| 1179 |
return json.dumps(result, ensure_ascii=False, indent=2)
|
| 1180 |
if isinstance(result, (list, tuple)):
|
| 1181 |
if len(result) == 1:
|
| 1182 |
return str(result[0]).strip()
|
|
|
|
| 1183 |
texts = []
|
| 1184 |
for item in result:
|
| 1185 |
if isinstance(item, str) and item.strip():
|
|
|
|
| 1211 |
return ResponseCleaner.clean_glm(str(result), include_thinking)
|
| 1212 |
|
| 1213 |
|
|
|
|
| 1214 |
def create_provider(model_id: str, config: Config,
|
| 1215 |
instance_id: int = 0) -> ModelProvider:
|
| 1216 |
if model_id not in MODEL_REGISTRY:
|
|
|
|
| 1343 |
inst.record_failure()
|
| 1344 |
log.warning(
|
| 1345 |
f"[LB] Failover instance {inst.instance_id} "
|
| 1346 |
+
f"for '{self.model_id}' failed: {e}"
|
| 1347 |
)
|
| 1348 |
|
| 1349 |
raise APIError(
|
|
|
|
| 1426 |
}
|
| 1427 |
|
| 1428 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1429 |
+
# MULTI-MODEL CLIENT
|
| 1430 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1431 |
|
| 1432 |
class MultiModelClient:
|
|
|
|
| 1505 |
|
| 1506 |
def send_message(
|
| 1507 |
self,
|
| 1508 |
+
message: Any, # str OR list (multimodal)
|
| 1509 |
*,
|
| 1510 |
stream: bool = False,
|
| 1511 |
model: Optional[str] = None,
|
|
|
|
| 1514 |
temperature: Optional[float] = None,
|
| 1515 |
max_tokens: Optional[int] = None,
|
| 1516 |
include_thinking: Optional[bool] = None,
|
| 1517 |
+
images: Optional[List[str]] = None,
|
| 1518 |
**kwargs,
|
| 1519 |
) -> Union[str, Generator]:
|
| 1520 |
model_id = model or self._current_model
|
| 1521 |
if model_id not in MODEL_REGISTRY:
|
| 1522 |
raise ModelNotFoundError(model_id)
|
| 1523 |
mdef = MODEL_REGISTRY[model_id]
|
| 1524 |
+
|
| 1525 |
+
# ── Normalise multimodal content ──────────────────────
|
| 1526 |
+
if isinstance(message, list):
|
| 1527 |
+
text, extracted_images = extract_text_and_images(message)
|
| 1528 |
+
if not images:
|
| 1529 |
+
images = extracted_images
|
| 1530 |
+
message = text
|
| 1531 |
+
|
| 1532 |
+
if isinstance(message, str):
|
| 1533 |
+
message = message.strip()
|
| 1534 |
+
else:
|
| 1535 |
+
message = str(message).strip()
|
| 1536 |
+
|
| 1537 |
+
if not message and not images:
|
| 1538 |
raise APIError("Empty message", "INVALID_INPUT", 400)
|
| 1539 |
if len(message) > self.config.max_message_length:
|
| 1540 |
raise APIError("Message too long", "INVALID_INPUT", 400)
|
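A hypothetical caller-side view of the normalisation above: a list-form message is split into text plus images before the length checks run, so vision requests and plain-text requests share one entry point.

    content = [
        {"type": "text", "text": "What is in this picture?"},
        {"type": "image_url", "image_url": {"url": "https://example.com/dog.jpg"}},
    ]
    # client is a MultiModelClient from the pool; "command-a-vision" is one of the registry ids
    reply = client.send_message(content, model="command-a-vision")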
|
|
|
| 1582 |
system_prompt=eff_sys,
|
| 1583 |
temperature=eff_temp,
|
| 1584 |
max_tokens=max_tokens,
|
| 1585 |
+
images=images,
|
| 1586 |
**extra,
|
| 1587 |
)
|
| 1588 |
return self._wrap_stream(gen, conv, start, model_id)
|
|
|
|
| 1594 |
system_prompt=eff_sys,
|
| 1595 |
temperature=eff_temp,
|
| 1596 |
max_tokens=max_tokens,
|
| 1597 |
+
images=images,
|
| 1598 |
**extra,
|
| 1599 |
)
|
| 1600 |
dur = (time.monotonic() - start) * 1000
|
|
|
|
| 1686 |
return c
|
| 1687 |
|
| 1688 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1689 |
+
# ALIAS RESOLVER
|
| 1690 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1691 |
|
| 1692 |
ALIASES = {
|
|
|
|
| 1707 |
|
| 1708 |
|
| 1709 |
def resolve_alias(model_id: str) -> str:
|
| 1710 |
+
if not model_id:
|
| 1711 |
+
return config.default_model
|
| 1712 |
return ALIASES.get(model_id.lower(), model_id)
|
| 1713 |
|
| 1714 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
|
|
| 1741 |
"name": APP_NAME,
|
| 1742 |
"version": VERSION,
|
| 1743 |
"default_model": config.default_model,
|
| 1744 |
+
"features": ["load_balancing", "10_req_per_second_limit", "failover", "vision"],
|
| 1745 |
"models": list(MODEL_REGISTRY.keys()),
|
| 1746 |
"beta_models": [mid for mid, mdef in MODEL_REGISTRY.items() if mdef.is_beta],
|
| 1747 |
+
"vision_models": [mid for mid, mdef in MODEL_REGISTRY.items() if mdef.supports_vision],
|
| 1748 |
"endpoints": {
|
| 1749 |
"POST /chat": "Chat with any model",
|
| 1750 |
"POST /chat/stream": "Streaming chat",
|
| 1751 |
+
"POST /v1/chat/completions": "OpenAI-compatible (supports vision)",
|
| 1752 |
"GET /v1/models": "List models",
|
| 1753 |
"POST /models/init": "Init a model",
|
| 1754 |
+
"GET /health": "Health check",
|
| 1755 |
"GET /metrics": "Metrics",
|
| 1756 |
+
"GET /lb/status": "Load balancer status",
|
| 1757 |
},
|
| 1758 |
})
|
| 1759 |
|
|
|
|
| 1761 |
@app.route("/chat", methods=["POST"])
|
| 1762 |
def chat():
|
| 1763 |
data = freq.get_json(force=True, silent=True) or {}
|
| 1764 |
+
raw_message = data.get("message", "")
|
| 1765 |
+
images = data.get("images", [])
|
| 1766 |
+
|
| 1767 |
+
# Support multimodal content directly in message field
|
| 1768 |
+
if isinstance(raw_message, list):
|
| 1769 |
+
text, extracted = extract_text_and_images(raw_message)
|
| 1770 |
+
images = images or extracted
|
| 1771 |
+
message = text
|
| 1772 |
+
else:
|
| 1773 |
+
message = str(raw_message).strip()
|
| 1774 |
+
|
| 1775 |
+
if not message and not images:
|
| 1776 |
return jsonify({"ok": False, "error": "'message' required"}), 400
|
| 1777 |
+
|
| 1778 |
model_id = resolve_alias(data.get("model", config.default_model))
|
| 1779 |
include_thinking = data.get("include_thinking", config.include_thinking)
|
| 1780 |
client = pool.acquire()
|
| 1781 |
if data.get("new_conversation"):
|
| 1782 |
client.new_conversation(data.get("system_prompt"), model_id)
|
| 1783 |
|
|
|
|
| 1784 |
extra = {}
|
| 1785 |
if model_id == "command-a-reasoning" and "thinking_budget" in data:
|
| 1786 |
extra["thinking_budget"] = data["thinking_budget"]
|
|
|
|
| 1791 |
temperature=data.get("temperature"),
|
| 1792 |
max_tokens=data.get("max_tokens"),
|
| 1793 |
include_thinking=include_thinking,
|
| 1794 |
+
images=images or None,
|
| 1795 |
**extra,
|
| 1796 |
)
|
| 1797 |
thinking, clean = ThinkingParser.split(result)
|
|
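An illustrative request against the new /chat contract; the host is a placeholder and 7860 is the default port from __main__:

    import requests

    resp = requests.post(
        "http://localhost:7860/chat",
        json={
            "message": "What is in this image?",
            "images": ["https://example.com/cat.jpg"],   # URLs or data: URIs
            "model": "command-a-vision",
        },
        timeout=120,
    )
    print(resp.json())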
|
|
| 1813 |
@app.route("/chat/stream", methods=["POST"])
|
| 1814 |
def chat_stream():
|
| 1815 |
data = freq.get_json(force=True, silent=True) or {}
|
| 1816 |
+
raw_message = data.get("message", "")
|
| 1817 |
+
images = data.get("images", [])
|
| 1818 |
+
|
| 1819 |
+
if isinstance(raw_message, list):
|
| 1820 |
+
text, extracted = extract_text_and_images(raw_message)
|
| 1821 |
+
images = images or extracted
|
| 1822 |
+
message = text
|
| 1823 |
+
else:
|
| 1824 |
+
message = str(raw_message).strip()
|
| 1825 |
+
|
| 1826 |
+
if not message and not images:
|
| 1827 |
return jsonify({"ok": False, "error": "'message' required"}), 400
|
| 1828 |
+
|
| 1829 |
model_id = resolve_alias(data.get("model", config.default_model))
|
| 1830 |
include_thinking = data.get("include_thinking", config.include_thinking)
|
| 1831 |
client = pool.acquire()
|
|
|
|
| 1847 |
temperature=data.get("temperature"),
|
| 1848 |
max_tokens=data.get("max_tokens"),
|
| 1849 |
include_thinking=include_thinking,
|
| 1850 |
+
images=images or None,
|
| 1851 |
**extra,
|
| 1852 |
):
|
| 1853 |
yield f"data: {json.dumps({'chunk': chunk})}\n\n"
|
|
|
|
| 1858 |
temperature=data.get("temperature"),
|
| 1859 |
max_tokens=data.get("max_tokens"),
|
| 1860 |
include_thinking=include_thinking,
|
| 1861 |
+
images=images or None,
|
| 1862 |
**extra,
|
| 1863 |
)
|
| 1864 |
yield f"data: {json.dumps({'chunk': result})}\n\n"
|
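For reference, a minimal consumer of the data: {"chunk": ...} frames this generator emits; the URL is a placeholder:

    import json
    import requests

    with requests.post(
        "http://localhost:7860/chat/stream",
        json={"message": "Tell me a joke", "model": "gpt-oss-120b"},
        stream=True,
    ) as r:
        for line in r.iter_lines():
            if line.startswith(b"data: "):
                payload = json.loads(line[len(b"data: "):])
                print(payload.get("chunk", ""), end="", flush=True)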
|
|
|
| 1903 |
def openai_compat():
|
| 1904 |
if freq.method == "OPTIONS":
|
| 1905 |
return "", 200
|
| 1906 |
+
|
| 1907 |
data = freq.get_json(force=True, silent=True) or {}
|
| 1908 |
messages = data.get("messages", [])
|
| 1909 |
do_stream = data.get("stream", False)
|
|
|
|
| 1915 |
if model_id not in MODEL_REGISTRY:
|
| 1916 |
return jsonify({
|
| 1917 |
"error": {
|
| 1918 |
+
"message": f"Model '{model_id}' not found. Available: {list(MODEL_REGISTRY.keys())}",
|
| 1919 |
"type": "invalid_request_error",
|
| 1920 |
+
"available_models": list(MODEL_REGISTRY.keys()),
|
| 1921 |
}
|
| 1922 |
}), 404
|
| 1923 |
+
|
| 1924 |
if not messages:
|
| 1925 |
return jsonify({"error": {"message": "messages required"}}), 400
|
| 1926 |
|
| 1927 |
+
# ── Extract user message, system prompt, and images ───────
|
| 1928 |
+
user_msg: str = ""
|
| 1929 |
+
system_prompt: Optional[str] = None
|
| 1930 |
+
images: List[str] = []
|
| 1931 |
+
|
| 1932 |
for msg in messages:
|
| 1933 |
+
role = msg.get("role", "")
|
| 1934 |
+
content = msg.get("content", "")
|
| 1935 |
+
|
| 1936 |
+
if role == "system":
|
| 1937 |
+
system_prompt = content if isinstance(content, str) else str(content)
|
| 1938 |
+
|
| 1939 |
+
if role == "user":
|
| 1940 |
+
if isinstance(content, list):
|
| 1941 |
+
text, imgs = extract_text_and_images(content)
|
| 1942 |
+
user_msg = text
|
| 1943 |
+
images.extend(imgs)
|
| 1944 |
+
elif isinstance(content, str):
|
| 1945 |
+
user_msg = content
|
| 1946 |
+
else:
|
| 1947 |
+
user_msg = str(content)
|
| 1948 |
+
|
| 1949 |
+
if not user_msg and not images:
|
| 1950 |
return jsonify({"error": {"message": "No user message"}}), 400
|
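The loop above accepts the standard OpenAI vision message shape; an invented example of a body a client could POST to /v1/chat/completions:

    payload = {
        "model": "command-a-vision",
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Describe this image."},
                    {"type": "image_url",
                     "image_url": {"url": "data:image/png;base64,iVBORw0KGgo..."}},
                ],
            },
        ],
        "stream": False,
    }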
| 1951 |
|
| 1952 |
rid = f"chatcmpl-{uuid.uuid4().hex[:29]}"
|
|
|
|
| 1954 |
client = pool.acquire()
|
| 1955 |
client.new_conversation(system_prompt, model_id)
|
| 1956 |
|
| 1957 |
+
# Replay history (all but the last user message)
|
| 1958 |
for msg in messages[:-1]:
|
| 1959 |
role = msg.get("role")
|
| 1960 |
content = msg.get("content", "")
|
| 1961 |
if role in ("user", "assistant") and content:
|
| 1962 |
+
text = (
|
| 1963 |
+
extract_text_and_images(content)[0]
|
| 1964 |
+
if isinstance(content, list)
|
| 1965 |
+
else str(content)
|
| 1966 |
+
)
|
| 1967 |
+
if text:
|
| 1968 |
+
client.active_conversation.add_message(role, text)
|
| 1969 |
|
| 1970 |
mdef = MODEL_REGISTRY[model_id]
|
| 1971 |
|
|
|
|
| 1972 |
extra = {}
|
| 1973 |
if model_id == "command-a-reasoning" and "thinking_budget" in data:
|
| 1974 |
extra["thinking_budget"] = data["thinking_budget"]
|
|
|
|
| 1981 |
for chunk in client.send_message(
|
| 1982 |
user_msg, stream=True, model=model_id,
|
| 1983 |
temperature=temperature, max_tokens=max_tokens,
|
| 1984 |
+
include_thinking=include_thinking,
|
| 1985 |
+
images=images or None, **extra,
|
| 1986 |
):
|
| 1987 |
yield f"data: {json.dumps({'id': rid, 'object': 'chat.completion.chunk', 'created': created, 'model': model_id, 'choices': [{'index': 0, 'delta': {'content': chunk}, 'finish_reason': None}]})}\n\n"
|
| 1988 |
else:
|
| 1989 |
result = client.send_message(
|
| 1990 |
user_msg, model=model_id, temperature=temperature,
|
| 1991 |
+
max_tokens=max_tokens, include_thinking=include_thinking,
|
| 1992 |
+
images=images or None, **extra,
|
| 1993 |
)
|
| 1994 |
yield f"data: {json.dumps({'id': rid, 'object': 'chat.completion.chunk', 'created': created, 'model': model_id, 'choices': [{'index': 0, 'delta': {'content': result}, 'finish_reason': None}]})}\n\n"
|
| 1995 |
yield f"data: {json.dumps({'id': rid, 'object': 'chat.completion.chunk', 'created': created, 'model': model_id, 'choices': [{'index': 0, 'delta': {}, 'finish_reason': 'stop'}]})}\n\n"
|
|
|
|
| 2002 |
|
| 2003 |
result = client.send_message(
|
| 2004 |
user_msg, model=model_id, temperature=temperature,
|
| 2005 |
+
max_tokens=max_tokens, include_thinking=include_thinking,
|
| 2006 |
+
images=images or None, **extra,
|
| 2007 |
)
|
| 2008 |
return jsonify({
|
| 2009 |
"id": rid,
|
|
|
|
| 2101 |
|
| 2102 |
if __name__ == "__main__":
|
| 2103 |
port = int(os.environ.get("PORT", 7860))
|
| 2104 |
+
log.info(f"Starting {APP_NAME} v{VERSION} on port {port}")
|
| 2105 |
log.info(f"Models: {list(MODEL_REGISTRY.keys())}")
|
| 2106 |
log.info(f"Rate limit: {config.rate_limit_rps} req/s (burst: {config.rate_limit_burst})")
|
| 2107 |
for mid, mdef in MODEL_REGISTRY.items():
|
|
|
|
| 2110 |
if mdef.lb_enabled
|
| 2111 |
else "LB OFF (single instance)"
|
| 2112 |
)
|
| 2113 |
+
vision_str = " [VISION]" if mdef.supports_vision else ""
|
| 2114 |
beta_str = " [BETA]" if mdef.is_beta else ""
|
| 2115 |
+
log.info(f" {mid}: {lb_str}{vision_str}{beta_str}")
|
| 2116 |
app.run(host="0.0.0.0", port=port, threaded=True)
|