Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -23,7 +23,7 @@ except ImportError:
|
|
| 23 |
# CONFIG & CONSTANTS
|
| 24 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 25 |
|
| 26 |
-
VERSION = "2.
|
| 27 |
APP_NAME = "Multi-Model-AI-API"
|
| 28 |
DEFAULT_SYSTEM_PROMPT = "You are a helpful, friendly AI assistant."
|
| 29 |
DEFAULT_MODEL = "gpt-oss-120b"
|
|
@@ -63,9 +63,9 @@ class ModelDef:
|
|
| 63 |
api_name: Optional[str] = None
|
| 64 |
extra_params: Dict[str, Any] = field(default_factory=dict)
|
| 65 |
clean_analysis: bool = False
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
|
| 70 |
MODEL_REGISTRY: Dict[str, ModelDef] = {}
|
| 71 |
|
|
@@ -103,6 +103,17 @@ def _init_registry():
|
|
| 103 |
extra_params={"max_new_tokens": 700},
|
| 104 |
lb_pool_size=1, lb_enabled=False, # NO load balancing for translate
|
| 105 |
))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 106 |
register_model(ModelDef(
|
| 107 |
model_id="minimax-vl-01", display_name="MiniMax VL-01",
|
| 108 |
provider_type="gradio_client", space_id="MiniMaxAI/MiniMax-VL-01",
|
|
@@ -142,6 +153,27 @@ def _init_registry():
|
|
| 142 |
supports_thinking=False, max_tokens_default=4096,
|
| 143 |
lb_pool_size=2, lb_enabled=True,
|
| 144 |
))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 145 |
|
| 146 |
|
| 147 |
_init_registry()
|
|
@@ -158,8 +190,8 @@ class Config:
|
|
| 158 |
max_retries: int = 3
|
| 159 |
retry_backoff_base: float = 1.5
|
| 160 |
retry_jitter: float = 0.5
|
| 161 |
-
rate_limit_rps: int = 10
|
| 162 |
-
rate_limit_burst: int = 15
|
| 163 |
pool_size: int = 2
|
| 164 |
max_history_messages: int = 50
|
| 165 |
max_message_length: int = 10000
|
|
@@ -342,6 +374,28 @@ class ResponseCleaner:
|
|
| 342 |
return str(chatbot).strip() if chatbot else ""
|
| 343 |
return str(result)
|
| 344 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 345 |
@classmethod
|
| 346 |
def clean(cls, text: str, model_id: str = "",
|
| 347 |
include_thinking: bool = True) -> str:
|
|
@@ -459,7 +513,6 @@ class Metrics:
|
|
| 459 |
requests_per_model: Dict[str, int] = field(default_factory=dict)
|
| 460 |
_latencies: deque = field(default_factory=lambda: deque(maxlen=1000), repr=False)
|
| 461 |
started_at: float = field(default_factory=time.time)
|
| 462 |
-
# Load balancer metrics
|
| 463 |
lb_total_dispatches: int = 0
|
| 464 |
lb_failovers: int = 0
|
| 465 |
|
|
@@ -519,10 +572,8 @@ metrics = Metrics()
|
|
| 519 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 520 |
|
| 521 |
class RateLimiter:
|
| 522 |
-
"""Token-bucket rate limiter. Default: 10 requests/second with burst."""
|
| 523 |
-
|
| 524 |
def __init__(self, rps: int = 10, burst: int = 15):
|
| 525 |
-
self.rate = float(rps)
|
| 526 |
self.max_tokens = float(burst)
|
| 527 |
self.tokens = float(burst)
|
| 528 |
self.last_refill = time.monotonic()
|
|
@@ -544,7 +595,7 @@ class RateLimiter:
|
|
| 544 |
return True
|
| 545 |
if time.monotonic() >= deadline:
|
| 546 |
return False
|
| 547 |
-
time.sleep(0.05)
|
| 548 |
|
| 549 |
def get_info(self) -> Dict:
|
| 550 |
with self._lock:
|
|
@@ -650,7 +701,6 @@ class ModelProvider(ABC):
|
|
| 650 |
self.instance_id = instance_id
|
| 651 |
self.ready = False
|
| 652 |
self._lock = threading.Lock()
|
| 653 |
-
# Per-instance health tracking
|
| 654 |
self._consecutive_failures = 0
|
| 655 |
self._last_success_time = 0.0
|
| 656 |
self._last_failure_time = 0.0
|
|
@@ -686,20 +736,16 @@ class ModelProvider(ABC):
|
|
| 686 |
|
| 687 |
@property
|
| 688 |
def health_score(self) -> float:
|
| 689 |
-
"""0.0 (worst) to 1.0 (best). Used by load balancer to pick instance."""
|
| 690 |
if not self.ready:
|
| 691 |
return 0.0
|
| 692 |
score = 1.0
|
| 693 |
-
# Penalise consecutive failures
|
| 694 |
score -= min(self._consecutive_failures * 0.2, 0.8)
|
| 695 |
-
# Penalise high avg latency (>10s = bad)
|
| 696 |
if self._latencies:
|
| 697 |
avg = self.avg_latency
|
| 698 |
if avg > 10000:
|
| 699 |
score -= 0.3
|
| 700 |
elif avg > 5000:
|
| 701 |
score -= 0.15
|
| 702 |
-
# Penalise high failure rate
|
| 703 |
if self._total_requests > 5:
|
| 704 |
fail_rate = self._total_failures / self._total_requests
|
| 705 |
score -= fail_rate * 0.4
|
|
@@ -903,6 +949,7 @@ class GradioClientProvider(ModelProvider):
|
|
| 903 |
max_new_tokens=max_new,
|
| 904 |
api_name=self.model_def.api_name,
|
| 905 |
)
|
|
|
|
| 906 |
elif mid == "command-a-translate":
|
| 907 |
max_new = (max_tokens
|
| 908 |
or self.model_def.extra_params.get("max_new_tokens", 700))
|
|
@@ -911,6 +958,20 @@ class GradioClientProvider(ModelProvider):
|
|
| 911 |
max_new_tokens=max_new,
|
| 912 |
api_name=self.model_def.api_name,
|
| 913 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 914 |
elif mid == "minimax-vl-01":
|
| 915 |
temp = (temperature if temperature is not None
|
| 916 |
else self.model_def.default_temperature)
|
|
@@ -923,6 +984,7 @@ class GradioClientProvider(ModelProvider):
|
|
| 923 |
max_tokens=max_tok, temperature=temp, top_p=top_p,
|
| 924 |
api_name=self.model_def.api_name,
|
| 925 |
)
|
|
|
|
| 926 |
elif mid == "glm-4.5":
|
| 927 |
sys_p = system_prompt or self.config.default_system_prompt
|
| 928 |
temp = (temperature if temperature is not None
|
|
@@ -937,6 +999,7 @@ class GradioClientProvider(ModelProvider):
|
|
| 937 |
api_name=self.model_def.api_name,
|
| 938 |
)
|
| 939 |
return self._extract_glm(result, include)
|
|
|
|
| 940 |
elif mid == "chatgpt":
|
| 941 |
temp = (temperature if temperature is not None
|
| 942 |
else self.model_def.default_temperature)
|
|
@@ -954,12 +1017,34 @@ class GradioClientProvider(ModelProvider):
|
|
| 954 |
)
|
| 955 |
self._chat_counter += 1
|
| 956 |
return ResponseCleaner.extract_chatgpt_text(result)
|
|
|
|
| 957 |
elif mid == "qwen3-vl":
|
| 958 |
result = self._client.predict(
|
| 959 |
input_value={"files": None, "text": message},
|
| 960 |
api_name="/add_message",
|
| 961 |
)
|
| 962 |
return ResponseCleaner.extract_qwen_text(result)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 963 |
else:
|
| 964 |
raise APIError(f"Unknown model handler: {mid}")
|
| 965 |
|
|
@@ -976,6 +1061,45 @@ class GradioClientProvider(ModelProvider):
|
|
| 976 |
except Exception as e:
|
| 977 |
raise APIError(f"{mid} error: {e}", "PROVIDER_ERROR")
|
| 978 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 979 |
def _extract_glm(self, result, include_thinking: bool = True) -> str:
|
| 980 |
if isinstance(result, tuple) and len(result) >= 1:
|
| 981 |
chatbot = result[0]
|
|
@@ -996,7 +1120,7 @@ class GradioClientProvider(ModelProvider):
|
|
| 996 |
return ResponseCleaner.clean_glm(str(result), include_thinking)
|
| 997 |
|
| 998 |
|
| 999 |
-
# Factory
|
| 1000 |
def create_provider(model_id: str, config: Config,
|
| 1001 |
instance_id: int = 0) -> ModelProvider:
|
| 1002 |
if model_id not in MODEL_REGISTRY:
|
|
@@ -1007,18 +1131,10 @@ def create_provider(model_id: str, config: Config,
|
|
| 1007 |
return GradioClientProvider(mdef, config, instance_id)
|
| 1008 |
|
| 1009 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1010 |
-
# LOAD BALANCER
|
| 1011 |
-
# round-robin + failover
|
| 1012 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1013 |
|
| 1014 |
class LoadBalancedProviderPool:
|
| 1015 |
-
"""
|
| 1016 |
-
Manages multiple provider instances for a single model.
|
| 1017 |
-
Selects the best instance based on health score with
|
| 1018 |
-
weighted-random selection (healthier instances chosen more).
|
| 1019 |
-
Falls back through all instances on failure.
|
| 1020 |
-
"""
|
| 1021 |
-
|
| 1022 |
def __init__(self, model_id: str, config: Config):
|
| 1023 |
self.model_id = model_id
|
| 1024 |
self.config = config
|
|
@@ -1041,7 +1157,6 @@ class LoadBalancedProviderPool:
|
|
| 1041 |
return len(self._instances)
|
| 1042 |
|
| 1043 |
def initialize_all(self) -> int:
|
| 1044 |
-
"""Initialize all instances, return count of successful ones."""
|
| 1045 |
ok = 0
|
| 1046 |
for inst in self._instances:
|
| 1047 |
try:
|
|
@@ -1055,7 +1170,6 @@ class LoadBalancedProviderPool:
|
|
| 1055 |
return ok
|
| 1056 |
|
| 1057 |
def initialize_one(self) -> bool:
|
| 1058 |
-
"""Initialize at least one instance."""
|
| 1059 |
for inst in self._instances:
|
| 1060 |
try:
|
| 1061 |
if inst.initialize():
|
|
@@ -1065,30 +1179,21 @@ class LoadBalancedProviderPool:
|
|
| 1065 |
return False
|
| 1066 |
|
| 1067 |
def _select_instance(self) -> ModelProvider:
|
| 1068 |
-
"""
|
| 1069 |
-
Select best available instance.
|
| 1070 |
-
Strategy: weighted random by health score.
|
| 1071 |
-
If all have equal scores, falls back to round-robin.
|
| 1072 |
-
"""
|
| 1073 |
if len(self._instances) == 1:
|
| 1074 |
return self._instances[0]
|
| 1075 |
|
| 1076 |
with self._lock:
|
| 1077 |
-
# Collect health scores
|
| 1078 |
scored = []
|
| 1079 |
for inst in self._instances:
|
| 1080 |
score = inst.health_score
|
| 1081 |
-
# Give a minimum weight so unhealthy instances can still recover
|
| 1082 |
scored.append((inst, max(score, 0.05)))
|
| 1083 |
|
| 1084 |
total_weight = sum(s for _, s in scored)
|
| 1085 |
if total_weight <= 0:
|
| 1086 |
-
# All dead, just round-robin
|
| 1087 |
inst = self._instances[self._rr_index % len(self._instances)]
|
| 1088 |
self._rr_index += 1
|
| 1089 |
return inst
|
| 1090 |
|
| 1091 |
-
# Weighted random selection
|
| 1092 |
r = random.uniform(0, total_weight)
|
| 1093 |
cumulative = 0.0
|
| 1094 |
for inst, weight in scored:
|
|
@@ -1096,29 +1201,21 @@ class LoadBalancedProviderPool:
|
|
| 1096 |
if r <= cumulative:
|
| 1097 |
return inst
|
| 1098 |
|
| 1099 |
-
# Fallback
|
| 1100 |
return scored[-1][0]
|
| 1101 |
|
| 1102 |
def _get_ordered_instances(self) -> List[ModelProvider]:
|
| 1103 |
-
"""Return instances ordered by health score (best first)."""
|
| 1104 |
return sorted(self._instances, key=lambda p: p.health_score, reverse=True)
|
| 1105 |
|
| 1106 |
def execute(self, fn_name: str, **kwargs) -> Any:
|
| 1107 |
-
"""
|
| 1108 |
-
Execute a provider method with automatic failover.
|
| 1109 |
-
Tries the best instance first, fails over to others.
|
| 1110 |
-
"""
|
| 1111 |
primary = self._select_instance()
|
| 1112 |
metrics.record_lb_dispatch()
|
| 1113 |
|
| 1114 |
-
# Ensure primary is ready
|
| 1115 |
if not primary.ready:
|
| 1116 |
try:
|
| 1117 |
primary.initialize()
|
| 1118 |
except Exception:
|
| 1119 |
pass
|
| 1120 |
|
| 1121 |
-
# Try primary
|
| 1122 |
start = time.monotonic()
|
| 1123 |
try:
|
| 1124 |
result = self._call_provider(primary, fn_name, **kwargs)
|
|
@@ -1132,7 +1229,6 @@ class LoadBalancedProviderPool:
|
|
| 1132 |
f"'{self.model_id}' failed: {primary_err}"
|
| 1133 |
)
|
| 1134 |
|
| 1135 |
-
# Failover through remaining instances
|
| 1136 |
for inst in self._get_ordered_instances():
|
| 1137 |
if inst is primary:
|
| 1138 |
continue
|
|
@@ -1166,11 +1262,6 @@ class LoadBalancedProviderPool:
|
|
| 1166 |
)
|
| 1167 |
|
| 1168 |
def execute_stream(self, **kwargs) -> Generator[str, None, None]:
|
| 1169 |
-
"""
|
| 1170 |
-
Execute streaming with failover.
|
| 1171 |
-
Since generators can't easily be retried mid-stream,
|
| 1172 |
-
we do failover only on initial connection failure.
|
| 1173 |
-
"""
|
| 1174 |
primary = self._select_instance()
|
| 1175 |
metrics.record_lb_dispatch()
|
| 1176 |
|
|
@@ -1180,7 +1271,6 @@ class LoadBalancedProviderPool:
|
|
| 1180 |
except Exception:
|
| 1181 |
pass
|
| 1182 |
|
| 1183 |
-
# Try primary
|
| 1184 |
try:
|
| 1185 |
yield from self._call_provider_stream(primary, **kwargs)
|
| 1186 |
return
|
|
@@ -1191,7 +1281,6 @@ class LoadBalancedProviderPool:
|
|
| 1191 |
f"for '{self.model_id}' failed: {primary_err}"
|
| 1192 |
)
|
| 1193 |
|
| 1194 |
-
# Failover
|
| 1195 |
for inst in self._get_ordered_instances():
|
| 1196 |
if inst is primary:
|
| 1197 |
continue
|
|
@@ -1242,6 +1331,7 @@ class LoadBalancedProviderPool:
|
|
| 1242 |
"model_id": self.model_id,
|
| 1243 |
"lb_enabled": self.mdef.lb_enabled,
|
| 1244 |
"pool_size": len(self._instances),
|
|
|
|
| 1245 |
"instances": [inst.get_instance_info() for inst in self._instances],
|
| 1246 |
}
|
| 1247 |
|
|
@@ -1280,14 +1370,13 @@ class MultiModelClient:
|
|
| 1280 |
return self._lb_pools[model_id]
|
| 1281 |
|
| 1282 |
def _ensure_ready(self, model_id: str) -> LoadBalancedProviderPool:
|
| 1283 |
-
|
| 1284 |
-
|
| 1285 |
-
has_ready = any(inst.ready for inst in pool._instances)
|
| 1286 |
if not has_ready:
|
| 1287 |
-
if not
|
| 1288 |
raise APIError(f"Cannot init any instance for {model_id}",
|
| 1289 |
"INIT_FAILED")
|
| 1290 |
-
return
|
| 1291 |
|
| 1292 |
@property
|
| 1293 |
def active_conversation(self) -> Conversation:
|
|
@@ -1312,16 +1401,15 @@ class MultiModelClient:
|
|
| 1312 |
|
| 1313 |
def init_model(self, model_id: str) -> bool:
|
| 1314 |
try:
|
| 1315 |
-
|
| 1316 |
-
return
|
| 1317 |
except Exception:
|
| 1318 |
return False
|
| 1319 |
|
| 1320 |
def init_model_all(self, model_id: str) -> int:
|
| 1321 |
-
"""Init all instances in the pool, return count of ready ones."""
|
| 1322 |
try:
|
| 1323 |
-
|
| 1324 |
-
return
|
| 1325 |
except Exception:
|
| 1326 |
return 0
|
| 1327 |
|
|
@@ -1450,8 +1538,8 @@ class MultiModelClient:
|
|
| 1450 |
|
| 1451 |
def get_status(self) -> Dict:
|
| 1452 |
lb_info = {}
|
| 1453 |
-
for model_id,
|
| 1454 |
-
lb_info[model_id] =
|
| 1455 |
|
| 1456 |
return {
|
| 1457 |
"version": VERSION,
|
|
@@ -1464,7 +1552,7 @@ class MultiModelClient:
|
|
| 1464 |
}
|
| 1465 |
|
| 1466 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1467 |
-
# SESSION POOL
|
| 1468 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1469 |
|
| 1470 |
class SessionPool:
|
|
@@ -1502,10 +1590,14 @@ ALIASES = {
|
|
| 1502 |
"cohere-vision": "command-a-vision",
|
| 1503 |
"command-translate": "command-a-translate",
|
| 1504 |
"cohere-translate": "command-a-translate", "translate": "command-a-translate",
|
|
|
|
|
|
|
| 1505 |
"minimax": "minimax-vl-01", "minimax-vl": "minimax-vl-01",
|
| 1506 |
"glm": "glm-4.5", "glm4": "glm-4.5", "glm-4": "glm-4.5", "zhipu": "glm-4.5",
|
| 1507 |
"gpt": "chatgpt", "gpt-3.5": "chatgpt", "gpt3": "chatgpt", "openai": "chatgpt",
|
| 1508 |
"qwen": "qwen3-vl", "qwen3": "qwen3-vl", "qwen-vl": "qwen3-vl",
|
|
|
|
|
|
|
| 1509 |
}
|
| 1510 |
|
| 1511 |
|
|
@@ -1544,6 +1636,7 @@ def index():
|
|
| 1544 |
"default_model": config.default_model,
|
| 1545 |
"features": ["load_balancing", "10_req_per_second_limit", "failover"],
|
| 1546 |
"models": list(MODEL_REGISTRY.keys()),
|
|
|
|
| 1547 |
"endpoints": {
|
| 1548 |
"POST /chat": "Chat with any model",
|
| 1549 |
"POST /chat/stream": "Streaming chat",
|
|
@@ -1568,14 +1661,22 @@ def chat():
|
|
| 1568 |
client = pool.acquire()
|
| 1569 |
if data.get("new_conversation"):
|
| 1570 |
client.new_conversation(data.get("system_prompt"), model_id)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1571 |
result = client.send_message(
|
| 1572 |
message, model=model_id,
|
| 1573 |
system_prompt=data.get("system_prompt"),
|
| 1574 |
temperature=data.get("temperature"),
|
| 1575 |
max_tokens=data.get("max_tokens"),
|
| 1576 |
include_thinking=include_thinking,
|
|
|
|
| 1577 |
)
|
| 1578 |
thinking, clean = ThinkingParser.split(result)
|
|
|
|
| 1579 |
resp = {
|
| 1580 |
"ok": True,
|
| 1581 |
"response": clean,
|
|
@@ -1585,6 +1686,8 @@ def chat():
|
|
| 1585 |
}
|
| 1586 |
if thinking:
|
| 1587 |
resp["thinking"] = thinking
|
|
|
|
|
|
|
| 1588 |
return jsonify(resp)
|
| 1589 |
|
| 1590 |
|
|
@@ -1602,6 +1705,10 @@ def chat_stream():
|
|
| 1602 |
mdef = MODEL_REGISTRY.get(model_id)
|
| 1603 |
use_stream = mdef.supports_streaming if mdef else False
|
| 1604 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1605 |
def generate():
|
| 1606 |
try:
|
| 1607 |
if use_stream:
|
|
@@ -1611,6 +1718,7 @@ def chat_stream():
|
|
| 1611 |
temperature=data.get("temperature"),
|
| 1612 |
max_tokens=data.get("max_tokens"),
|
| 1613 |
include_thinking=include_thinking,
|
|
|
|
| 1614 |
):
|
| 1615 |
yield f"data: {json.dumps({'chunk': chunk})}\n\n"
|
| 1616 |
else:
|
|
@@ -1620,6 +1728,7 @@ def chat_stream():
|
|
| 1620 |
temperature=data.get("temperature"),
|
| 1621 |
max_tokens=data.get("max_tokens"),
|
| 1622 |
include_thinking=include_thinking,
|
|
|
|
| 1623 |
)
|
| 1624 |
yield f"data: {json.dumps({'chunk': result})}\n\n"
|
| 1625 |
yield "data: [DONE]\n\n"
|
|
@@ -1634,7 +1743,7 @@ def chat_stream():
|
|
| 1634 |
def list_models():
|
| 1635 |
models = []
|
| 1636 |
for mid, mdef in MODEL_REGISTRY.items():
|
| 1637 |
-
|
| 1638 |
"id": mid,
|
| 1639 |
"object": "model",
|
| 1640 |
"owned_by": mdef.owned_by,
|
|
@@ -1652,7 +1761,10 @@ def list_models():
|
|
| 1652 |
"enabled": mdef.lb_enabled,
|
| 1653 |
"pool_size": mdef.lb_pool_size,
|
| 1654 |
},
|
| 1655 |
-
}
|
|
|
|
|
|
|
|
|
|
| 1656 |
return jsonify({"object": "list", "data": models})
|
| 1657 |
|
| 1658 |
|
|
@@ -1692,7 +1804,6 @@ def openai_compat():
|
|
| 1692 |
client = pool.acquire()
|
| 1693 |
client.new_conversation(system_prompt, model_id)
|
| 1694 |
|
| 1695 |
-
# Add history from messages
|
| 1696 |
for msg in messages[:-1]:
|
| 1697 |
role = msg.get("role")
|
| 1698 |
content = msg.get("content", "")
|
|
@@ -1701,50 +1812,30 @@ def openai_compat():
|
|
| 1701 |
|
| 1702 |
mdef = MODEL_REGISTRY[model_id]
|
| 1703 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1704 |
if do_stream:
|
| 1705 |
def generate():
|
| 1706 |
try:
|
| 1707 |
-
yield (
|
| 1708 |
-
f"data: {json.dumps({'id': rid, 'object': 'chat.completion.chunk', "
|
| 1709 |
-
f"'created': created, 'model': model_id, 'choices': ["
|
| 1710 |
-
f"{{'index': 0, 'delta': {{'role': 'assistant'}}, "
|
| 1711 |
-
f"'finish_reason': None}}]})}\n\n"
|
| 1712 |
-
)
|
| 1713 |
if mdef.supports_streaming:
|
| 1714 |
for chunk in client.send_message(
|
| 1715 |
user_msg, stream=True, model=model_id,
|
| 1716 |
temperature=temperature, max_tokens=max_tokens,
|
| 1717 |
-
include_thinking=include_thinking,
|
| 1718 |
):
|
| 1719 |
-
yield (
|
| 1720 |
-
f"data: {json.dumps({'id': rid, "
|
| 1721 |
-
f"'object': 'chat.completion.chunk', "
|
| 1722 |
-
f"'created': created, 'model': model_id, "
|
| 1723 |
-
f"'choices': [{{'index': 0, "
|
| 1724 |
-
f"'delta': {{'content': chunk}}, "
|
| 1725 |
-
f"'finish_reason': None}}]})}\n\n"
|
| 1726 |
-
)
|
| 1727 |
else:
|
| 1728 |
result = client.send_message(
|
| 1729 |
user_msg, model=model_id, temperature=temperature,
|
| 1730 |
max_tokens=max_tokens,
|
| 1731 |
-
include_thinking=include_thinking,
|
| 1732 |
-
)
|
| 1733 |
-
yield (
|
| 1734 |
-
f"data: {json.dumps({'id': rid, "
|
| 1735 |
-
f"'object': 'chat.completion.chunk', "
|
| 1736 |
-
f"'created': created, 'model': model_id, "
|
| 1737 |
-
f"'choices': [{{'index': 0, "
|
| 1738 |
-
f"'delta': {{'content': result}}, "
|
| 1739 |
-
f"'finish_reason': None}}]})}\n\n"
|
| 1740 |
)
|
| 1741 |
-
|
| 1742 |
-
|
| 1743 |
-
f"'object': 'chat.completion.chunk', "
|
| 1744 |
-
f"'created': created, 'model': model_id, "
|
| 1745 |
-
f"'choices': [{{'index': 0, 'delta': {{}}, "
|
| 1746 |
-
f"'finish_reason': 'stop'}}]})}\n\n"
|
| 1747 |
-
)
|
| 1748 |
yield "data: [DONE]\n\n"
|
| 1749 |
except Exception as e:
|
| 1750 |
yield f"data: {json.dumps({'error': {'message': str(e)}})}\n\n"
|
|
@@ -1754,7 +1845,7 @@ def openai_compat():
|
|
| 1754 |
|
| 1755 |
result = client.send_message(
|
| 1756 |
user_msg, model=model_id, temperature=temperature,
|
| 1757 |
-
max_tokens=max_tokens, include_thinking=include_thinking,
|
| 1758 |
)
|
| 1759 |
return jsonify({
|
| 1760 |
"id": rid,
|
|
@@ -1800,11 +1891,10 @@ def metrics_endpoint():
|
|
| 1800 |
|
| 1801 |
@app.route("/lb/status", methods=["GET"])
|
| 1802 |
def lb_status():
|
| 1803 |
-
"""Detailed load balancer status for all models across all clients."""
|
| 1804 |
all_pools = {}
|
| 1805 |
for client in pool._clients:
|
| 1806 |
for model_id, lb_pool in client._lb_pools.items():
|
| 1807 |
-
key =
|
| 1808 |
if key not in all_pools:
|
| 1809 |
all_pools[key] = []
|
| 1810 |
all_pools[key].append(lb_pool.get_pool_info())
|
|
@@ -1835,13 +1925,16 @@ def init_model_ep():
|
|
| 1835 |
}), 400
|
| 1836 |
count = pool.init_model(model_id)
|
| 1837 |
mdef = MODEL_REGISTRY[model_id]
|
| 1838 |
-
|
| 1839 |
"ok": True,
|
| 1840 |
"model": model_id,
|
| 1841 |
"initialized_instances": count,
|
| 1842 |
"lb_enabled": mdef.lb_enabled,
|
| 1843 |
"pool_size_per_client": mdef.lb_pool_size,
|
| 1844 |
-
}
|
|
|
|
|
|
|
|
|
|
| 1845 |
|
| 1846 |
|
| 1847 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
@@ -1854,10 +1947,11 @@ if __name__ == "__main__":
|
|
| 1854 |
log.info(f"Models: {list(MODEL_REGISTRY.keys())}")
|
| 1855 |
log.info(f"Rate limit: {config.rate_limit_rps} req/s (burst: {config.rate_limit_burst})")
|
| 1856 |
for mid, mdef in MODEL_REGISTRY.items():
|
| 1857 |
-
|
| 1858 |
f"LB ON (pool={mdef.lb_pool_size})"
|
| 1859 |
if mdef.lb_enabled
|
| 1860 |
else "LB OFF (single instance)"
|
| 1861 |
)
|
| 1862 |
-
|
|
|
|
| 1863 |
app.run(host="0.0.0.0", port=port, threaded=True)
|
|
|
|
| 23 |
# CONFIG & CONSTANTS
|
| 24 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 25 |
|
| 26 |
+
VERSION = "2.4.0-hf-lb"
|
| 27 |
APP_NAME = "Multi-Model-AI-API"
|
| 28 |
DEFAULT_SYSTEM_PROMPT = "You are a helpful, friendly AI assistant."
|
| 29 |
DEFAULT_MODEL = "gpt-oss-120b"
|
|
|
|
| 63 |
api_name: Optional[str] = None
|
| 64 |
extra_params: Dict[str, Any] = field(default_factory=dict)
|
| 65 |
clean_analysis: bool = False
|
| 66 |
+
lb_pool_size: int = 2
|
| 67 |
+
lb_enabled: bool = True
|
| 68 |
+
is_beta: bool = False # Beta flag for experimental models
|
| 69 |
|
| 70 |
MODEL_REGISTRY: Dict[str, ModelDef] = {}
|
| 71 |
|
|
|
|
| 103 |
extra_params={"max_new_tokens": 700},
|
| 104 |
lb_pool_size=1, lb_enabled=False, # NO load balancing for translate
|
| 105 |
))
|
| 106 |
+
# ββ NEW: Command-A Reasoning ββ
|
| 107 |
+
register_model(ModelDef(
|
| 108 |
+
model_id="command-a-reasoning", display_name="Cohere Command-A Reasoning",
|
| 109 |
+
provider_type="gradio_client", space_id="CohereLabs/command-a-reasoning",
|
| 110 |
+
owned_by="cohere", description="Cohere reasoning model with thinking budget",
|
| 111 |
+
api_name="/chat", supports_vision=False, supports_system_prompt=False,
|
| 112 |
+
supports_temperature=False, supports_streaming=False, supports_history=False,
|
| 113 |
+
supports_thinking=True, thinking_default=True, max_tokens_default=4096,
|
| 114 |
+
extra_params={"thinking_budget": 500},
|
| 115 |
+
lb_pool_size=2, lb_enabled=True,
|
| 116 |
+
))
|
| 117 |
register_model(ModelDef(
|
| 118 |
model_id="minimax-vl-01", display_name="MiniMax VL-01",
|
| 119 |
provider_type="gradio_client", space_id="MiniMaxAI/MiniMax-VL-01",
|
|
|
|
| 153 |
supports_thinking=False, max_tokens_default=4096,
|
| 154 |
lb_pool_size=2, lb_enabled=True,
|
| 155 |
))
|
| 156 |
+
# ββ NEW: Qwen2.5-Coder (BETA) ββ
|
| 157 |
+
register_model(ModelDef(
|
| 158 |
+
model_id="qwen2.5-coder", display_name="Qwen2.5-Coder Artifacts (BETA)",
|
| 159 |
+
provider_type="gradio_client", space_id="Qwen/Qwen2.5-Coder-Artifacts",
|
| 160 |
+
owned_by="alibaba", description="Alibaba Qwen2.5 Coder β code generation model (BETA)",
|
| 161 |
+
api_name="/generation_code", supports_vision=False, supports_system_prompt=True,
|
| 162 |
+
supports_temperature=False, supports_streaming=False, supports_history=False,
|
| 163 |
+
supports_thinking=False, max_tokens_default=4096,
|
| 164 |
+
extra_params={
|
| 165 |
+
"system_prompt_override": (
|
| 166 |
+
"You are a helpful assistant. You are a skilled programming assistant. "
|
| 167 |
+
"You help users write, debug, and understand code across all languages. "
|
| 168 |
+
"Respond with clear explanations and clean code. "
|
| 169 |
+
"Do NOT generate HTML artifacts or web page previews. "
|
| 170 |
+
"Do NOT wrap everything in a single HTML file. "
|
| 171 |
+
"Just provide the code the user asks for with explanations."
|
| 172 |
+
),
|
| 173 |
+
},
|
| 174 |
+
lb_pool_size=2, lb_enabled=True,
|
| 175 |
+
is_beta=True,
|
| 176 |
+
))
|
| 177 |
|
| 178 |
|
| 179 |
_init_registry()
|
|
|
|
| 190 |
max_retries: int = 3
|
| 191 |
retry_backoff_base: float = 1.5
|
| 192 |
retry_jitter: float = 0.5
|
| 193 |
+
rate_limit_rps: int = 10
|
| 194 |
+
rate_limit_burst: int = 15
|
| 195 |
pool_size: int = 2
|
| 196 |
max_history_messages: int = 50
|
| 197 |
max_message_length: int = 10000
|
|
|
|
| 374 |
return str(chatbot).strip() if chatbot else ""
|
| 375 |
return str(result)
|
| 376 |
|
| 377 |
+
@classmethod
|
| 378 |
+
def extract_qwen_coder_text(cls, result: Any) -> str:
|
| 379 |
+
"""Extract text from Qwen2.5-Coder /generation_code response.
|
| 380 |
+
Returns tuple of (markdown, html). We want the markdown part."""
|
| 381 |
+
if result is None:
|
| 382 |
+
return ""
|
| 383 |
+
if isinstance(result, str):
|
| 384 |
+
return result.strip()
|
| 385 |
+
if isinstance(result, tuple):
|
| 386 |
+
# /generation_code returns (markdown_str, html_str)
|
| 387 |
+
# We want the markdown part (index 0)
|
| 388 |
+
if len(result) >= 1 and isinstance(result[0], str):
|
| 389 |
+
text = result[0].strip()
|
| 390 |
+
if text:
|
| 391 |
+
return text
|
| 392 |
+
# Fallback to second element if first is empty
|
| 393 |
+
if len(result) >= 2 and isinstance(result[1], str):
|
| 394 |
+
return result[1].strip()
|
| 395 |
+
if isinstance(result, (list, dict)):
|
| 396 |
+
return str(result)
|
| 397 |
+
return str(result) if result else ""
|
| 398 |
+
|
| 399 |
@classmethod
|
| 400 |
def clean(cls, text: str, model_id: str = "",
|
| 401 |
include_thinking: bool = True) -> str:
|
|
|
|
| 513 |
requests_per_model: Dict[str, int] = field(default_factory=dict)
|
| 514 |
_latencies: deque = field(default_factory=lambda: deque(maxlen=1000), repr=False)
|
| 515 |
started_at: float = field(default_factory=time.time)
|
|
|
|
| 516 |
lb_total_dispatches: int = 0
|
| 517 |
lb_failovers: int = 0
|
| 518 |
|
|
|
|
| 572 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 573 |
|
| 574 |
class RateLimiter:
|
|
|
|
|
|
|
| 575 |
def __init__(self, rps: int = 10, burst: int = 15):
|
| 576 |
+
self.rate = float(rps)
|
| 577 |
self.max_tokens = float(burst)
|
| 578 |
self.tokens = float(burst)
|
| 579 |
self.last_refill = time.monotonic()
|
|
|
|
| 595 |
return True
|
| 596 |
if time.monotonic() >= deadline:
|
| 597 |
return False
|
| 598 |
+
time.sleep(0.05)
|
| 599 |
|
| 600 |
def get_info(self) -> Dict:
|
| 601 |
with self._lock:
|
|
|
|
| 701 |
self.instance_id = instance_id
|
| 702 |
self.ready = False
|
| 703 |
self._lock = threading.Lock()
|
|
|
|
| 704 |
self._consecutive_failures = 0
|
| 705 |
self._last_success_time = 0.0
|
| 706 |
self._last_failure_time = 0.0
|
|
|
|
| 736 |
|
| 737 |
@property
|
| 738 |
def health_score(self) -> float:
|
|
|
|
| 739 |
if not self.ready:
|
| 740 |
return 0.0
|
| 741 |
score = 1.0
|
|
|
|
| 742 |
score -= min(self._consecutive_failures * 0.2, 0.8)
|
|
|
|
| 743 |
if self._latencies:
|
| 744 |
avg = self.avg_latency
|
| 745 |
if avg > 10000:
|
| 746 |
score -= 0.3
|
| 747 |
elif avg > 5000:
|
| 748 |
score -= 0.15
|
|
|
|
| 749 |
if self._total_requests > 5:
|
| 750 |
fail_rate = self._total_failures / self._total_requests
|
| 751 |
score -= fail_rate * 0.4
|
|
|
|
| 949 |
max_new_tokens=max_new,
|
| 950 |
api_name=self.model_def.api_name,
|
| 951 |
)
|
| 952 |
+
|
| 953 |
elif mid == "command-a-translate":
|
| 954 |
max_new = (max_tokens
|
| 955 |
or self.model_def.extra_params.get("max_new_tokens", 700))
|
|
|
|
| 958 |
max_new_tokens=max_new,
|
| 959 |
api_name=self.model_def.api_name,
|
| 960 |
)
|
| 961 |
+
|
| 962 |
+
elif mid == "command-a-reasoning":
|
| 963 |
+
# Cohere Command-A Reasoning with thinking budget
|
| 964 |
+
thinking_budget = kw.get(
|
| 965 |
+
"thinking_budget",
|
| 966 |
+
self.model_def.extra_params.get("thinking_budget", 500),
|
| 967 |
+
)
|
| 968 |
+
result = self._client.predict(
|
| 969 |
+
message=message,
|
| 970 |
+
thinking_budget=thinking_budget,
|
| 971 |
+
api_name=self.model_def.api_name,
|
| 972 |
+
)
|
| 973 |
+
return self._extract_reasoning(result)
|
| 974 |
+
|
| 975 |
elif mid == "minimax-vl-01":
|
| 976 |
temp = (temperature if temperature is not None
|
| 977 |
else self.model_def.default_temperature)
|
|
|
|
| 984 |
max_tokens=max_tok, temperature=temp, top_p=top_p,
|
| 985 |
api_name=self.model_def.api_name,
|
| 986 |
)
|
| 987 |
+
|
| 988 |
elif mid == "glm-4.5":
|
| 989 |
sys_p = system_prompt or self.config.default_system_prompt
|
| 990 |
temp = (temperature if temperature is not None
|
|
|
|
| 999 |
api_name=self.model_def.api_name,
|
| 1000 |
)
|
| 1001 |
return self._extract_glm(result, include)
|
| 1002 |
+
|
| 1003 |
elif mid == "chatgpt":
|
| 1004 |
temp = (temperature if temperature is not None
|
| 1005 |
else self.model_def.default_temperature)
|
|
|
|
| 1017 |
)
|
| 1018 |
self._chat_counter += 1
|
| 1019 |
return ResponseCleaner.extract_chatgpt_text(result)
|
| 1020 |
+
|
| 1021 |
elif mid == "qwen3-vl":
|
| 1022 |
result = self._client.predict(
|
| 1023 |
input_value={"files": None, "text": message},
|
| 1024 |
api_name="/add_message",
|
| 1025 |
)
|
| 1026 |
return ResponseCleaner.extract_qwen_text(result)
|
| 1027 |
+
|
| 1028 |
+
elif mid == "qwen2.5-coder":
|
| 1029 |
+
# First set the system prompt to override artifacts behavior
|
| 1030 |
+
sys_override = self.model_def.extra_params.get(
|
| 1031 |
+
"system_prompt_override", ""
|
| 1032 |
+
)
|
| 1033 |
+
if sys_override:
|
| 1034 |
+
try:
|
| 1035 |
+
self._client.predict(
|
| 1036 |
+
input=sys_override,
|
| 1037 |
+
api_name="/lambda_1",
|
| 1038 |
+
)
|
| 1039 |
+
except Exception as e:
|
| 1040 |
+
log.warning(f"[qwen2.5-coder] Failed to set system prompt: {e}")
|
| 1041 |
+
|
| 1042 |
+
result = self._client.predict(
|
| 1043 |
+
query=message,
|
| 1044 |
+
api_name="/generation_code",
|
| 1045 |
+
)
|
| 1046 |
+
return ResponseCleaner.extract_qwen_coder_text(result)
|
| 1047 |
+
|
| 1048 |
else:
|
| 1049 |
raise APIError(f"Unknown model handler: {mid}")
|
| 1050 |
|
|
|
|
| 1061 |
except Exception as e:
|
| 1062 |
raise APIError(f"{mid} error: {e}", "PROVIDER_ERROR")
|
| 1063 |
|
| 1064 |
+
def _extract_reasoning(self, result: Any) -> str:
|
| 1065 |
+
"""Extract response from Command-A Reasoning.
|
| 1066 |
+
The API returns str | float | bool | list | dict from the Json component."""
|
| 1067 |
+
if result is None:
|
| 1068 |
+
return ""
|
| 1069 |
+
if isinstance(result, str):
|
| 1070 |
+
return result.strip()
|
| 1071 |
+
if isinstance(result, dict):
|
| 1072 |
+
# Try common response keys
|
| 1073 |
+
for key in ("response", "output", "answer", "text", "content", "result"):
|
| 1074 |
+
if key in result:
|
| 1075 |
+
val = result[key]
|
| 1076 |
+
if isinstance(val, str):
|
| 1077 |
+
return val.strip()
|
| 1078 |
+
return str(val)
|
| 1079 |
+
# Check for thinking + response structure
|
| 1080 |
+
thinking = result.get("thinking", "")
|
| 1081 |
+
response = result.get("response", result.get("output", ""))
|
| 1082 |
+
if thinking and response:
|
| 1083 |
+
return f"<thinking>\n{thinking}\n</thinking>\n{response}"
|
| 1084 |
+
if response:
|
| 1085 |
+
return str(response).strip()
|
| 1086 |
+
# Fallback: serialize entire dict
|
| 1087 |
+
return json.dumps(result, ensure_ascii=False, indent=2)
|
| 1088 |
+
if isinstance(result, (list, tuple)):
|
| 1089 |
+
if len(result) == 1:
|
| 1090 |
+
return str(result[0]).strip()
|
| 1091 |
+
# Try to find text in list elements
|
| 1092 |
+
texts = []
|
| 1093 |
+
for item in result:
|
| 1094 |
+
if isinstance(item, str) and item.strip():
|
| 1095 |
+
texts.append(item.strip())
|
| 1096 |
+
if texts:
|
| 1097 |
+
return "\n".join(texts)
|
| 1098 |
+
return json.dumps(result, ensure_ascii=False)
|
| 1099 |
+
if isinstance(result, (int, float, bool)):
|
| 1100 |
+
return str(result)
|
| 1101 |
+
return str(result)
|
| 1102 |
+
|
| 1103 |
def _extract_glm(self, result, include_thinking: bool = True) -> str:
|
| 1104 |
if isinstance(result, tuple) and len(result) >= 1:
|
| 1105 |
chatbot = result[0]
|
|
|
|
| 1120 |
return ResponseCleaner.clean_glm(str(result), include_thinking)
|
| 1121 |
|
| 1122 |
|
| 1123 |
+
# Factory
|
| 1124 |
def create_provider(model_id: str, config: Config,
|
| 1125 |
instance_id: int = 0) -> ModelProvider:
|
| 1126 |
if model_id not in MODEL_REGISTRY:
|
|
|
|
| 1131 |
return GradioClientProvider(mdef, config, instance_id)
|
| 1132 |
|
| 1133 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1134 |
+
# LOAD BALANCER
|
|
|
|
| 1135 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1136 |
|
| 1137 |
class LoadBalancedProviderPool:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1138 |
def __init__(self, model_id: str, config: Config):
|
| 1139 |
self.model_id = model_id
|
| 1140 |
self.config = config
|
|
|
|
| 1157 |
return len(self._instances)
|
| 1158 |
|
| 1159 |
def initialize_all(self) -> int:
|
|
|
|
| 1160 |
ok = 0
|
| 1161 |
for inst in self._instances:
|
| 1162 |
try:
|
|
|
|
| 1170 |
return ok
|
| 1171 |
|
| 1172 |
def initialize_one(self) -> bool:
|
|
|
|
| 1173 |
for inst in self._instances:
|
| 1174 |
try:
|
| 1175 |
if inst.initialize():
|
|
|
|
| 1179 |
return False
|
| 1180 |
|
| 1181 |
def _select_instance(self) -> ModelProvider:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1182 |
if len(self._instances) == 1:
|
| 1183 |
return self._instances[0]
|
| 1184 |
|
| 1185 |
with self._lock:
|
|
|
|
| 1186 |
scored = []
|
| 1187 |
for inst in self._instances:
|
| 1188 |
score = inst.health_score
|
|
|
|
| 1189 |
scored.append((inst, max(score, 0.05)))
|
| 1190 |
|
| 1191 |
total_weight = sum(s for _, s in scored)
|
| 1192 |
if total_weight <= 0:
|
|
|
|
| 1193 |
inst = self._instances[self._rr_index % len(self._instances)]
|
| 1194 |
self._rr_index += 1
|
| 1195 |
return inst
|
| 1196 |
|
|
|
|
| 1197 |
r = random.uniform(0, total_weight)
|
| 1198 |
cumulative = 0.0
|
| 1199 |
for inst, weight in scored:
|
|
|
|
| 1201 |
if r <= cumulative:
|
| 1202 |
return inst
|
| 1203 |
|
|
|
|
| 1204 |
return scored[-1][0]
|
| 1205 |
|
| 1206 |
def _get_ordered_instances(self) -> List[ModelProvider]:
    """All pool instances, ordered healthiest first (for failover order)."""
    return sorted(self._instances, key=lambda p: -p.health_score)
|
| 1208 |
|
| 1209 |
def execute(self, fn_name: str, **kwargs) -> Any:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1210 |
primary = self._select_instance()
|
| 1211 |
metrics.record_lb_dispatch()
|
| 1212 |
|
|
|
|
| 1213 |
if not primary.ready:
|
| 1214 |
try:
|
| 1215 |
primary.initialize()
|
| 1216 |
except Exception:
|
| 1217 |
pass
|
| 1218 |
|
|
|
|
| 1219 |
start = time.monotonic()
|
| 1220 |
try:
|
| 1221 |
result = self._call_provider(primary, fn_name, **kwargs)
|
|
|
|
| 1229 |
f"'{self.model_id}' failed: {primary_err}"
|
| 1230 |
)
|
| 1231 |
|
|
|
|
| 1232 |
for inst in self._get_ordered_instances():
|
| 1233 |
if inst is primary:
|
| 1234 |
continue
|
|
|
|
| 1262 |
)
|
| 1263 |
|
| 1264 |
def execute_stream(self, **kwargs) -> Generator[str, None, None]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1265 |
primary = self._select_instance()
|
| 1266 |
metrics.record_lb_dispatch()
|
| 1267 |
|
|
|
|
| 1271 |
except Exception:
|
| 1272 |
pass
|
| 1273 |
|
|
|
|
| 1274 |
try:
|
| 1275 |
yield from self._call_provider_stream(primary, **kwargs)
|
| 1276 |
return
|
|
|
|
| 1281 |
f"for '{self.model_id}' failed: {primary_err}"
|
| 1282 |
)
|
| 1283 |
|
|
|
|
| 1284 |
for inst in self._get_ordered_instances():
|
| 1285 |
if inst is primary:
|
| 1286 |
continue
|
|
|
|
| 1331 |
"model_id": self.model_id,
|
| 1332 |
"lb_enabled": self.mdef.lb_enabled,
|
| 1333 |
"pool_size": len(self._instances),
|
| 1334 |
+
"is_beta": self.mdef.is_beta,
|
| 1335 |
"instances": [inst.get_instance_info() for inst in self._instances],
|
| 1336 |
}
|
| 1337 |
|
|
|
|
| 1370 |
return self._lb_pools[model_id]
|
| 1371 |
|
| 1372 |
def _ensure_ready(self, model_id: str) -> LoadBalancedProviderPool:
|
| 1373 |
+
lb_pool = self._get_lb_pool(model_id)
|
| 1374 |
+
has_ready = any(inst.ready for inst in lb_pool._instances)
|
|
|
|
| 1375 |
if not has_ready:
|
| 1376 |
+
if not lb_pool.initialize_one():
|
| 1377 |
raise APIError(f"Cannot init any instance for {model_id}",
|
| 1378 |
"INIT_FAILED")
|
| 1379 |
+
return lb_pool
|
| 1380 |
|
| 1381 |
@property
|
| 1382 |
def active_conversation(self) -> Conversation:
|
|
|
|
| 1401 |
|
| 1402 |
def init_model(self, model_id: str) -> bool:
    """Best-effort init of a single pooled instance for *model_id*.

    Returns False instead of raising on any failure.
    """
    try:
        return self._get_lb_pool(model_id).initialize_one()
    except Exception:
        return False
|
| 1408 |
|
| 1409 |
def init_model_all(self, model_id: str) -> int:
    """Initialize every pooled instance for *model_id*.

    Returns the count of successfully initialized instances, or 0 on any error.
    """
    try:
        return self._get_lb_pool(model_id).initialize_all()
    except Exception:
        return 0
|
| 1415 |
|
|
|
|
| 1538 |
|
| 1539 |
def get_status(self) -> Dict:
|
| 1540 |
lb_info = {}
|
| 1541 |
+
for model_id, lb_pool in self._lb_pools.items():
|
| 1542 |
+
lb_info[model_id] = lb_pool.get_pool_info()
|
| 1543 |
|
| 1544 |
return {
|
| 1545 |
"version": VERSION,
|
|
|
|
| 1552 |
}
|
| 1553 |
|
| 1554 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1555 |
+
# SESSION POOL
|
| 1556 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1557 |
|
| 1558 |
class SessionPool:
|
|
|
|
| 1590 |
"cohere-vision": "command-a-vision",
|
| 1591 |
"command-translate": "command-a-translate",
|
| 1592 |
"cohere-translate": "command-a-translate", "translate": "command-a-translate",
|
| 1593 |
+
"command-reasoning": "command-a-reasoning", "reasoning": "command-a-reasoning",
|
| 1594 |
+
"cohere-reasoning": "command-a-reasoning", "command-r": "command-a-reasoning",
|
| 1595 |
"minimax": "minimax-vl-01", "minimax-vl": "minimax-vl-01",
|
| 1596 |
"glm": "glm-4.5", "glm4": "glm-4.5", "glm-4": "glm-4.5", "zhipu": "glm-4.5",
|
| 1597 |
"gpt": "chatgpt", "gpt-3.5": "chatgpt", "gpt3": "chatgpt", "openai": "chatgpt",
|
| 1598 |
"qwen": "qwen3-vl", "qwen3": "qwen3-vl", "qwen-vl": "qwen3-vl",
|
| 1599 |
+
"qwen-coder": "qwen2.5-coder", "qwen2.5": "qwen2.5-coder",
|
| 1600 |
+
"qwen25-coder": "qwen2.5-coder", "coder": "qwen2.5-coder",
|
| 1601 |
}
|
| 1602 |
|
| 1603 |
|
|
|
|
| 1636 |
"default_model": config.default_model,
|
| 1637 |
"features": ["load_balancing", "10_req_per_second_limit", "failover"],
|
| 1638 |
"models": list(MODEL_REGISTRY.keys()),
|
| 1639 |
+
"beta_models": [mid for mid, mdef in MODEL_REGISTRY.items() if mdef.is_beta],
|
| 1640 |
"endpoints": {
|
| 1641 |
"POST /chat": "Chat with any model",
|
| 1642 |
"POST /chat/stream": "Streaming chat",
|
|
|
|
| 1661 |
client = pool.acquire()
|
| 1662 |
if data.get("new_conversation"):
|
| 1663 |
client.new_conversation(data.get("system_prompt"), model_id)
|
| 1664 |
+
|
| 1665 |
+
# Pass extra params for specific models
|
| 1666 |
+
extra = {}
|
| 1667 |
+
if model_id == "command-a-reasoning" and "thinking_budget" in data:
|
| 1668 |
+
extra["thinking_budget"] = data["thinking_budget"]
|
| 1669 |
+
|
| 1670 |
result = client.send_message(
|
| 1671 |
message, model=model_id,
|
| 1672 |
system_prompt=data.get("system_prompt"),
|
| 1673 |
temperature=data.get("temperature"),
|
| 1674 |
max_tokens=data.get("max_tokens"),
|
| 1675 |
include_thinking=include_thinking,
|
| 1676 |
+
**extra,
|
| 1677 |
)
|
| 1678 |
thinking, clean = ThinkingParser.split(result)
|
| 1679 |
+
mdef = MODEL_REGISTRY.get(model_id)
|
| 1680 |
resp = {
|
| 1681 |
"ok": True,
|
| 1682 |
"response": clean,
|
|
|
|
| 1686 |
}
|
| 1687 |
if thinking:
|
| 1688 |
resp["thinking"] = thinking
|
| 1689 |
+
if mdef and mdef.is_beta:
|
| 1690 |
+
resp["beta"] = True
|
| 1691 |
return jsonify(resp)
|
| 1692 |
|
| 1693 |
|
|
|
|
| 1705 |
mdef = MODEL_REGISTRY.get(model_id)
|
| 1706 |
use_stream = mdef.supports_streaming if mdef else False
|
| 1707 |
|
| 1708 |
+
extra = {}
|
| 1709 |
+
if model_id == "command-a-reasoning" and "thinking_budget" in data:
|
| 1710 |
+
extra["thinking_budget"] = data["thinking_budget"]
|
| 1711 |
+
|
| 1712 |
def generate():
|
| 1713 |
try:
|
| 1714 |
if use_stream:
|
|
|
|
| 1718 |
temperature=data.get("temperature"),
|
| 1719 |
max_tokens=data.get("max_tokens"),
|
| 1720 |
include_thinking=include_thinking,
|
| 1721 |
+
**extra,
|
| 1722 |
):
|
| 1723 |
yield f"data: {json.dumps({'chunk': chunk})}\n\n"
|
| 1724 |
else:
|
|
|
|
| 1728 |
temperature=data.get("temperature"),
|
| 1729 |
max_tokens=data.get("max_tokens"),
|
| 1730 |
include_thinking=include_thinking,
|
| 1731 |
+
**extra,
|
| 1732 |
)
|
| 1733 |
yield f"data: {json.dumps({'chunk': result})}\n\n"
|
| 1734 |
yield "data: [DONE]\n\n"
|
|
|
|
| 1743 |
def list_models():
|
| 1744 |
models = []
|
| 1745 |
for mid, mdef in MODEL_REGISTRY.items():
|
| 1746 |
+
model_info = {
|
| 1747 |
"id": mid,
|
| 1748 |
"object": "model",
|
| 1749 |
"owned_by": mdef.owned_by,
|
|
|
|
| 1761 |
"enabled": mdef.lb_enabled,
|
| 1762 |
"pool_size": mdef.lb_pool_size,
|
| 1763 |
},
|
| 1764 |
+
}
|
| 1765 |
+
if mdef.is_beta:
|
| 1766 |
+
model_info["beta"] = True
|
| 1767 |
+
models.append(model_info)
|
| 1768 |
return jsonify({"object": "list", "data": models})
|
| 1769 |
|
| 1770 |
|
|
|
|
| 1804 |
client = pool.acquire()
|
| 1805 |
client.new_conversation(system_prompt, model_id)
|
| 1806 |
|
|
|
|
| 1807 |
for msg in messages[:-1]:
|
| 1808 |
role = msg.get("role")
|
| 1809 |
content = msg.get("content", "")
|
|
|
|
| 1812 |
|
| 1813 |
mdef = MODEL_REGISTRY[model_id]
|
| 1814 |
|
| 1815 |
+
# Extra params
|
| 1816 |
+
extra = {}
|
| 1817 |
+
if model_id == "command-a-reasoning" and "thinking_budget" in data:
|
| 1818 |
+
extra["thinking_budget"] = data["thinking_budget"]
|
| 1819 |
+
|
| 1820 |
if do_stream:
|
| 1821 |
def generate():
|
| 1822 |
try:
|
| 1823 |
+
yield f"data: {json.dumps({'id': rid, 'object': 'chat.completion.chunk', 'created': created, 'model': model_id, 'choices': [{'index': 0, 'delta': {'role': 'assistant'}, 'finish_reason': None}]})}\n\n"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1824 |
if mdef.supports_streaming:
|
| 1825 |
for chunk in client.send_message(
|
| 1826 |
user_msg, stream=True, model=model_id,
|
| 1827 |
temperature=temperature, max_tokens=max_tokens,
|
| 1828 |
+
include_thinking=include_thinking, **extra,
|
| 1829 |
):
|
| 1830 |
+
yield f"data: {json.dumps({'id': rid, 'object': 'chat.completion.chunk', 'created': created, 'model': model_id, 'choices': [{'index': 0, 'delta': {'content': chunk}, 'finish_reason': None}]})}\n\n"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1831 |
else:
|
| 1832 |
result = client.send_message(
|
| 1833 |
user_msg, model=model_id, temperature=temperature,
|
| 1834 |
max_tokens=max_tokens,
|
| 1835 |
+
include_thinking=include_thinking, **extra,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1836 |
)
|
| 1837 |
+
yield f"data: {json.dumps({'id': rid, 'object': 'chat.completion.chunk', 'created': created, 'model': model_id, 'choices': [{'index': 0, 'delta': {'content': result}, 'finish_reason': None}]})}\n\n"
|
| 1838 |
+
yield f"data: {json.dumps({'id': rid, 'object': 'chat.completion.chunk', 'created': created, 'model': model_id, 'choices': [{'index': 0, 'delta': {}, 'finish_reason': 'stop'}]})}\n\n"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1839 |
yield "data: [DONE]\n\n"
|
| 1840 |
except Exception as e:
|
| 1841 |
yield f"data: {json.dumps({'error': {'message': str(e)}})}\n\n"
|
|
|
|
| 1845 |
|
| 1846 |
result = client.send_message(
|
| 1847 |
user_msg, model=model_id, temperature=temperature,
|
| 1848 |
+
max_tokens=max_tokens, include_thinking=include_thinking, **extra,
|
| 1849 |
)
|
| 1850 |
return jsonify({
|
| 1851 |
"id": rid,
|
|
|
|
| 1891 |
|
| 1892 |
@app.route("/lb/status", methods=["GET"])
|
| 1893 |
def lb_status():
|
|
|
|
| 1894 |
all_pools = {}
|
| 1895 |
for client in pool._clients:
|
| 1896 |
for model_id, lb_pool in client._lb_pools.items():
|
| 1897 |
+
key = model_id
|
| 1898 |
if key not in all_pools:
|
| 1899 |
all_pools[key] = []
|
| 1900 |
all_pools[key].append(lb_pool.get_pool_info())
|
|
|
|
| 1925 |
}), 400
|
| 1926 |
count = pool.init_model(model_id)
|
| 1927 |
mdef = MODEL_REGISTRY[model_id]
|
| 1928 |
+
resp = {
|
| 1929 |
"ok": True,
|
| 1930 |
"model": model_id,
|
| 1931 |
"initialized_instances": count,
|
| 1932 |
"lb_enabled": mdef.lb_enabled,
|
| 1933 |
"pool_size_per_client": mdef.lb_pool_size,
|
| 1934 |
+
}
|
| 1935 |
+
if mdef.is_beta:
|
| 1936 |
+
resp["beta"] = True
|
| 1937 |
+
return jsonify(resp)
|
| 1938 |
|
| 1939 |
|
| 1940 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
|
|
| 1947 |
log.info(f"Models: {list(MODEL_REGISTRY.keys())}")
|
| 1948 |
log.info(f"Rate limit: {config.rate_limit_rps} req/s (burst: {config.rate_limit_burst})")
|
| 1949 |
for mid, mdef in MODEL_REGISTRY.items():
|
| 1950 |
+
lb_str = (
|
| 1951 |
f"LB ON (pool={mdef.lb_pool_size})"
|
| 1952 |
if mdef.lb_enabled
|
| 1953 |
else "LB OFF (single instance)"
|
| 1954 |
)
|
| 1955 |
+
beta_str = " [BETA]" if mdef.is_beta else ""
|
| 1956 |
+
log.info(f" {mid}: {lb_str}{beta_str}")
|
| 1957 |
app.run(host="0.0.0.0", port=port, threaded=True)
|