MB-IDK committed on
Commit
f6799a8
Β·
verified Β·
1 Parent(s): 1cd3ed6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +201 -107
app.py CHANGED
@@ -23,7 +23,7 @@ except ImportError:
23
  # CONFIG & CONSTANTS
24
  # ═══════════════════════════════════════════════════════════════
25
 
26
- VERSION = "2.3.0-hf-lb"
27
  APP_NAME = "Multi-Model-AI-API"
28
  DEFAULT_SYSTEM_PROMPT = "You are a helpful, friendly AI assistant."
29
  DEFAULT_MODEL = "gpt-oss-120b"
@@ -63,9 +63,9 @@ class ModelDef:
63
  api_name: Optional[str] = None
64
  extra_params: Dict[str, Any] = field(default_factory=dict)
65
  clean_analysis: bool = False
66
- # Load balancing config per model
67
- lb_pool_size: int = 2 # number of provider instances for load balancing
68
- lb_enabled: bool = True # whether load balancing is enabled
69
 
70
  MODEL_REGISTRY: Dict[str, ModelDef] = {}
71
 
@@ -103,6 +103,17 @@ def _init_registry():
103
  extra_params={"max_new_tokens": 700},
104
  lb_pool_size=1, lb_enabled=False, # NO load balancing for translate
105
  ))
 
 
 
 
 
 
 
 
 
 
 
106
  register_model(ModelDef(
107
  model_id="minimax-vl-01", display_name="MiniMax VL-01",
108
  provider_type="gradio_client", space_id="MiniMaxAI/MiniMax-VL-01",
@@ -142,6 +153,27 @@ def _init_registry():
142
  supports_thinking=False, max_tokens_default=4096,
143
  lb_pool_size=2, lb_enabled=True,
144
  ))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145
 
146
 
147
  _init_registry()
@@ -158,8 +190,8 @@ class Config:
158
  max_retries: int = 3
159
  retry_backoff_base: float = 1.5
160
  retry_jitter: float = 0.5
161
- rate_limit_rps: int = 10 # requests per SECOND (changed from RPM)
162
- rate_limit_burst: int = 15 # burst capacity
163
  pool_size: int = 2
164
  max_history_messages: int = 50
165
  max_message_length: int = 10000
@@ -342,6 +374,28 @@ class ResponseCleaner:
342
  return str(chatbot).strip() if chatbot else ""
343
  return str(result)
344
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
345
  @classmethod
346
  def clean(cls, text: str, model_id: str = "",
347
  include_thinking: bool = True) -> str:
@@ -459,7 +513,6 @@ class Metrics:
459
  requests_per_model: Dict[str, int] = field(default_factory=dict)
460
  _latencies: deque = field(default_factory=lambda: deque(maxlen=1000), repr=False)
461
  started_at: float = field(default_factory=time.time)
462
- # Load balancer metrics
463
  lb_total_dispatches: int = 0
464
  lb_failovers: int = 0
465
 
@@ -519,10 +572,8 @@ metrics = Metrics()
519
  # ═══════════════════════════════════════════════════════════════
520
 
521
  class RateLimiter:
522
- """Token-bucket rate limiter. Default: 10 requests/second with burst."""
523
-
524
  def __init__(self, rps: int = 10, burst: int = 15):
525
- self.rate = float(rps) # tokens per second
526
  self.max_tokens = float(burst)
527
  self.tokens = float(burst)
528
  self.last_refill = time.monotonic()
@@ -544,7 +595,7 @@ class RateLimiter:
544
  return True
545
  if time.monotonic() >= deadline:
546
  return False
547
- time.sleep(0.05) # short sleep for per-second limiting
548
 
549
  def get_info(self) -> Dict:
550
  with self._lock:
@@ -650,7 +701,6 @@ class ModelProvider(ABC):
650
  self.instance_id = instance_id
651
  self.ready = False
652
  self._lock = threading.Lock()
653
- # Per-instance health tracking
654
  self._consecutive_failures = 0
655
  self._last_success_time = 0.0
656
  self._last_failure_time = 0.0
@@ -686,20 +736,16 @@ class ModelProvider(ABC):
686
 
687
  @property
688
  def health_score(self) -> float:
689
- """0.0 (worst) to 1.0 (best). Used by load balancer to pick instance."""
690
  if not self.ready:
691
  return 0.0
692
  score = 1.0
693
- # Penalise consecutive failures
694
  score -= min(self._consecutive_failures * 0.2, 0.8)
695
- # Penalise high avg latency (>10s = bad)
696
  if self._latencies:
697
  avg = self.avg_latency
698
  if avg > 10000:
699
  score -= 0.3
700
  elif avg > 5000:
701
  score -= 0.15
702
- # Penalise high failure rate
703
  if self._total_requests > 5:
704
  fail_rate = self._total_failures / self._total_requests
705
  score -= fail_rate * 0.4
@@ -903,6 +949,7 @@ class GradioClientProvider(ModelProvider):
903
  max_new_tokens=max_new,
904
  api_name=self.model_def.api_name,
905
  )
 
906
  elif mid == "command-a-translate":
907
  max_new = (max_tokens
908
  or self.model_def.extra_params.get("max_new_tokens", 700))
@@ -911,6 +958,20 @@ class GradioClientProvider(ModelProvider):
911
  max_new_tokens=max_new,
912
  api_name=self.model_def.api_name,
913
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
914
  elif mid == "minimax-vl-01":
915
  temp = (temperature if temperature is not None
916
  else self.model_def.default_temperature)
@@ -923,6 +984,7 @@ class GradioClientProvider(ModelProvider):
923
  max_tokens=max_tok, temperature=temp, top_p=top_p,
924
  api_name=self.model_def.api_name,
925
  )
 
926
  elif mid == "glm-4.5":
927
  sys_p = system_prompt or self.config.default_system_prompt
928
  temp = (temperature if temperature is not None
@@ -937,6 +999,7 @@ class GradioClientProvider(ModelProvider):
937
  api_name=self.model_def.api_name,
938
  )
939
  return self._extract_glm(result, include)
 
940
  elif mid == "chatgpt":
941
  temp = (temperature if temperature is not None
942
  else self.model_def.default_temperature)
@@ -954,12 +1017,34 @@ class GradioClientProvider(ModelProvider):
954
  )
955
  self._chat_counter += 1
956
  return ResponseCleaner.extract_chatgpt_text(result)
 
957
  elif mid == "qwen3-vl":
958
  result = self._client.predict(
959
  input_value={"files": None, "text": message},
960
  api_name="/add_message",
961
  )
962
  return ResponseCleaner.extract_qwen_text(result)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
963
  else:
964
  raise APIError(f"Unknown model handler: {mid}")
965
 
@@ -976,6 +1061,45 @@ class GradioClientProvider(ModelProvider):
976
  except Exception as e:
977
  raise APIError(f"{mid} error: {e}", "PROVIDER_ERROR")
978
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
979
  def _extract_glm(self, result, include_thinking: bool = True) -> str:
980
  if isinstance(result, tuple) and len(result) >= 1:
981
  chatbot = result[0]
@@ -996,7 +1120,7 @@ class GradioClientProvider(ModelProvider):
996
  return ResponseCleaner.clean_glm(str(result), include_thinking)
997
 
998
 
999
- # Factory β€” creates a single provider instance
1000
  def create_provider(model_id: str, config: Config,
1001
  instance_id: int = 0) -> ModelProvider:
1002
  if model_id not in MODEL_REGISTRY:
@@ -1007,18 +1131,10 @@ def create_provider(model_id: str, config: Config,
1007
  return GradioClientProvider(mdef, config, instance_id)
1008
 
1009
  # ═══════════════════════════════════════════════════════════════
1010
- # LOAD BALANCER β€” Per-model provider pool with health-aware
1011
- # round-robin + failover
1012
  # ═══════════════════════════════════════════════════════════════
1013
 
1014
  class LoadBalancedProviderPool:
1015
- """
1016
- Manages multiple provider instances for a single model.
1017
- Selects the best instance based on health score with
1018
- weighted-random selection (healthier instances chosen more).
1019
- Falls back through all instances on failure.
1020
- """
1021
-
1022
  def __init__(self, model_id: str, config: Config):
1023
  self.model_id = model_id
1024
  self.config = config
@@ -1041,7 +1157,6 @@ class LoadBalancedProviderPool:
1041
  return len(self._instances)
1042
 
1043
  def initialize_all(self) -> int:
1044
- """Initialize all instances, return count of successful ones."""
1045
  ok = 0
1046
  for inst in self._instances:
1047
  try:
@@ -1055,7 +1170,6 @@ class LoadBalancedProviderPool:
1055
  return ok
1056
 
1057
  def initialize_one(self) -> bool:
1058
- """Initialize at least one instance."""
1059
  for inst in self._instances:
1060
  try:
1061
  if inst.initialize():
@@ -1065,30 +1179,21 @@ class LoadBalancedProviderPool:
1065
  return False
1066
 
1067
  def _select_instance(self) -> ModelProvider:
1068
- """
1069
- Select best available instance.
1070
- Strategy: weighted random by health score.
1071
- If all have equal scores, falls back to round-robin.
1072
- """
1073
  if len(self._instances) == 1:
1074
  return self._instances[0]
1075
 
1076
  with self._lock:
1077
- # Collect health scores
1078
  scored = []
1079
  for inst in self._instances:
1080
  score = inst.health_score
1081
- # Give a minimum weight so unhealthy instances can still recover
1082
  scored.append((inst, max(score, 0.05)))
1083
 
1084
  total_weight = sum(s for _, s in scored)
1085
  if total_weight <= 0:
1086
- # All dead, just round-robin
1087
  inst = self._instances[self._rr_index % len(self._instances)]
1088
  self._rr_index += 1
1089
  return inst
1090
 
1091
- # Weighted random selection
1092
  r = random.uniform(0, total_weight)
1093
  cumulative = 0.0
1094
  for inst, weight in scored:
@@ -1096,29 +1201,21 @@ class LoadBalancedProviderPool:
1096
  if r <= cumulative:
1097
  return inst
1098
 
1099
- # Fallback
1100
  return scored[-1][0]
1101
 
1102
  def _get_ordered_instances(self) -> List[ModelProvider]:
1103
- """Return instances ordered by health score (best first)."""
1104
  return sorted(self._instances, key=lambda p: p.health_score, reverse=True)
1105
 
1106
  def execute(self, fn_name: str, **kwargs) -> Any:
1107
- """
1108
- Execute a provider method with automatic failover.
1109
- Tries the best instance first, fails over to others.
1110
- """
1111
  primary = self._select_instance()
1112
  metrics.record_lb_dispatch()
1113
 
1114
- # Ensure primary is ready
1115
  if not primary.ready:
1116
  try:
1117
  primary.initialize()
1118
  except Exception:
1119
  pass
1120
 
1121
- # Try primary
1122
  start = time.monotonic()
1123
  try:
1124
  result = self._call_provider(primary, fn_name, **kwargs)
@@ -1132,7 +1229,6 @@ class LoadBalancedProviderPool:
1132
  f"'{self.model_id}' failed: {primary_err}"
1133
  )
1134
 
1135
- # Failover through remaining instances
1136
  for inst in self._get_ordered_instances():
1137
  if inst is primary:
1138
  continue
@@ -1166,11 +1262,6 @@ class LoadBalancedProviderPool:
1166
  )
1167
 
1168
  def execute_stream(self, **kwargs) -> Generator[str, None, None]:
1169
- """
1170
- Execute streaming with failover.
1171
- Since generators can't easily be retried mid-stream,
1172
- we do failover only on initial connection failure.
1173
- """
1174
  primary = self._select_instance()
1175
  metrics.record_lb_dispatch()
1176
 
@@ -1180,7 +1271,6 @@ class LoadBalancedProviderPool:
1180
  except Exception:
1181
  pass
1182
 
1183
- # Try primary
1184
  try:
1185
  yield from self._call_provider_stream(primary, **kwargs)
1186
  return
@@ -1191,7 +1281,6 @@ class LoadBalancedProviderPool:
1191
  f"for '{self.model_id}' failed: {primary_err}"
1192
  )
1193
 
1194
- # Failover
1195
  for inst in self._get_ordered_instances():
1196
  if inst is primary:
1197
  continue
@@ -1242,6 +1331,7 @@ class LoadBalancedProviderPool:
1242
  "model_id": self.model_id,
1243
  "lb_enabled": self.mdef.lb_enabled,
1244
  "pool_size": len(self._instances),
 
1245
  "instances": [inst.get_instance_info() for inst in self._instances],
1246
  }
1247
 
@@ -1280,14 +1370,13 @@ class MultiModelClient:
1280
  return self._lb_pools[model_id]
1281
 
1282
  def _ensure_ready(self, model_id: str) -> LoadBalancedProviderPool:
1283
- pool = self._get_lb_pool(model_id)
1284
- # Make sure at least one instance is ready
1285
- has_ready = any(inst.ready for inst in pool._instances)
1286
  if not has_ready:
1287
- if not pool.initialize_one():
1288
  raise APIError(f"Cannot init any instance for {model_id}",
1289
  "INIT_FAILED")
1290
- return pool
1291
 
1292
  @property
1293
  def active_conversation(self) -> Conversation:
@@ -1312,16 +1401,15 @@ class MultiModelClient:
1312
 
1313
  def init_model(self, model_id: str) -> bool:
1314
  try:
1315
- pool = self._get_lb_pool(model_id)
1316
- return pool.initialize_one()
1317
  except Exception:
1318
  return False
1319
 
1320
  def init_model_all(self, model_id: str) -> int:
1321
- """Init all instances in the pool, return count of ready ones."""
1322
  try:
1323
- pool = self._get_lb_pool(model_id)
1324
- return pool.initialize_all()
1325
  except Exception:
1326
  return 0
1327
 
@@ -1450,8 +1538,8 @@ class MultiModelClient:
1450
 
1451
  def get_status(self) -> Dict:
1452
  lb_info = {}
1453
- for model_id, pool in self._lb_pools.items():
1454
- lb_info[model_id] = pool.get_pool_info()
1455
 
1456
  return {
1457
  "version": VERSION,
@@ -1464,7 +1552,7 @@ class MultiModelClient:
1464
  }
1465
 
1466
  # ═══════════════════════════════════════════════════════════════
1467
- # SESSION POOL (top-level pool of MultiModelClients)
1468
  # ═══════════════════════════════════════════════════════════════
1469
 
1470
  class SessionPool:
@@ -1502,10 +1590,14 @@ ALIASES = {
1502
  "cohere-vision": "command-a-vision",
1503
  "command-translate": "command-a-translate",
1504
  "cohere-translate": "command-a-translate", "translate": "command-a-translate",
 
 
1505
  "minimax": "minimax-vl-01", "minimax-vl": "minimax-vl-01",
1506
  "glm": "glm-4.5", "glm4": "glm-4.5", "glm-4": "glm-4.5", "zhipu": "glm-4.5",
1507
  "gpt": "chatgpt", "gpt-3.5": "chatgpt", "gpt3": "chatgpt", "openai": "chatgpt",
1508
  "qwen": "qwen3-vl", "qwen3": "qwen3-vl", "qwen-vl": "qwen3-vl",
 
 
1509
  }
1510
 
1511
 
@@ -1544,6 +1636,7 @@ def index():
1544
  "default_model": config.default_model,
1545
  "features": ["load_balancing", "10_req_per_second_limit", "failover"],
1546
  "models": list(MODEL_REGISTRY.keys()),
 
1547
  "endpoints": {
1548
  "POST /chat": "Chat with any model",
1549
  "POST /chat/stream": "Streaming chat",
@@ -1568,14 +1661,22 @@ def chat():
1568
  client = pool.acquire()
1569
  if data.get("new_conversation"):
1570
  client.new_conversation(data.get("system_prompt"), model_id)
 
 
 
 
 
 
1571
  result = client.send_message(
1572
  message, model=model_id,
1573
  system_prompt=data.get("system_prompt"),
1574
  temperature=data.get("temperature"),
1575
  max_tokens=data.get("max_tokens"),
1576
  include_thinking=include_thinking,
 
1577
  )
1578
  thinking, clean = ThinkingParser.split(result)
 
1579
  resp = {
1580
  "ok": True,
1581
  "response": clean,
@@ -1585,6 +1686,8 @@ def chat():
1585
  }
1586
  if thinking:
1587
  resp["thinking"] = thinking
 
 
1588
  return jsonify(resp)
1589
 
1590
 
@@ -1602,6 +1705,10 @@ def chat_stream():
1602
  mdef = MODEL_REGISTRY.get(model_id)
1603
  use_stream = mdef.supports_streaming if mdef else False
1604
 
 
 
 
 
1605
  def generate():
1606
  try:
1607
  if use_stream:
@@ -1611,6 +1718,7 @@ def chat_stream():
1611
  temperature=data.get("temperature"),
1612
  max_tokens=data.get("max_tokens"),
1613
  include_thinking=include_thinking,
 
1614
  ):
1615
  yield f"data: {json.dumps({'chunk': chunk})}\n\n"
1616
  else:
@@ -1620,6 +1728,7 @@ def chat_stream():
1620
  temperature=data.get("temperature"),
1621
  max_tokens=data.get("max_tokens"),
1622
  include_thinking=include_thinking,
 
1623
  )
1624
  yield f"data: {json.dumps({'chunk': result})}\n\n"
1625
  yield "data: [DONE]\n\n"
@@ -1634,7 +1743,7 @@ def chat_stream():
1634
  def list_models():
1635
  models = []
1636
  for mid, mdef in MODEL_REGISTRY.items():
1637
- models.append({
1638
  "id": mid,
1639
  "object": "model",
1640
  "owned_by": mdef.owned_by,
@@ -1652,7 +1761,10 @@ def list_models():
1652
  "enabled": mdef.lb_enabled,
1653
  "pool_size": mdef.lb_pool_size,
1654
  },
1655
- })
 
 
 
1656
  return jsonify({"object": "list", "data": models})
1657
 
1658
 
@@ -1692,7 +1804,6 @@ def openai_compat():
1692
  client = pool.acquire()
1693
  client.new_conversation(system_prompt, model_id)
1694
 
1695
- # Add history from messages
1696
  for msg in messages[:-1]:
1697
  role = msg.get("role")
1698
  content = msg.get("content", "")
@@ -1701,50 +1812,30 @@ def openai_compat():
1701
 
1702
  mdef = MODEL_REGISTRY[model_id]
1703
 
 
 
 
 
 
1704
  if do_stream:
1705
  def generate():
1706
  try:
1707
- yield (
1708
- f"data: {json.dumps({'id': rid, 'object': 'chat.completion.chunk', "
1709
- f"'created': created, 'model': model_id, 'choices': ["
1710
- f"{{'index': 0, 'delta': {{'role': 'assistant'}}, "
1711
- f"'finish_reason': None}}]})}\n\n"
1712
- )
1713
  if mdef.supports_streaming:
1714
  for chunk in client.send_message(
1715
  user_msg, stream=True, model=model_id,
1716
  temperature=temperature, max_tokens=max_tokens,
1717
- include_thinking=include_thinking,
1718
  ):
1719
- yield (
1720
- f"data: {json.dumps({'id': rid, "
1721
- f"'object': 'chat.completion.chunk', "
1722
- f"'created': created, 'model': model_id, "
1723
- f"'choices': [{{'index': 0, "
1724
- f"'delta': {{'content': chunk}}, "
1725
- f"'finish_reason': None}}]})}\n\n"
1726
- )
1727
  else:
1728
  result = client.send_message(
1729
  user_msg, model=model_id, temperature=temperature,
1730
  max_tokens=max_tokens,
1731
- include_thinking=include_thinking,
1732
- )
1733
- yield (
1734
- f"data: {json.dumps({'id': rid, "
1735
- f"'object': 'chat.completion.chunk', "
1736
- f"'created': created, 'model': model_id, "
1737
- f"'choices': [{{'index': 0, "
1738
- f"'delta': {{'content': result}}, "
1739
- f"'finish_reason': None}}]})}\n\n"
1740
  )
1741
- yield (
1742
- f"data: {json.dumps({'id': rid, "
1743
- f"'object': 'chat.completion.chunk', "
1744
- f"'created': created, 'model': model_id, "
1745
- f"'choices': [{{'index': 0, 'delta': {{}}, "
1746
- f"'finish_reason': 'stop'}}]})}\n\n"
1747
- )
1748
  yield "data: [DONE]\n\n"
1749
  except Exception as e:
1750
  yield f"data: {json.dumps({'error': {'message': str(e)}})}\n\n"
@@ -1754,7 +1845,7 @@ def openai_compat():
1754
 
1755
  result = client.send_message(
1756
  user_msg, model=model_id, temperature=temperature,
1757
- max_tokens=max_tokens, include_thinking=include_thinking,
1758
  )
1759
  return jsonify({
1760
  "id": rid,
@@ -1800,11 +1891,10 @@ def metrics_endpoint():
1800
 
1801
  @app.route("/lb/status", methods=["GET"])
1802
  def lb_status():
1803
- """Detailed load balancer status for all models across all clients."""
1804
  all_pools = {}
1805
  for client in pool._clients:
1806
  for model_id, lb_pool in client._lb_pools.items():
1807
- key = f"{model_id}"
1808
  if key not in all_pools:
1809
  all_pools[key] = []
1810
  all_pools[key].append(lb_pool.get_pool_info())
@@ -1835,13 +1925,16 @@ def init_model_ep():
1835
  }), 400
1836
  count = pool.init_model(model_id)
1837
  mdef = MODEL_REGISTRY[model_id]
1838
- return jsonify({
1839
  "ok": True,
1840
  "model": model_id,
1841
  "initialized_instances": count,
1842
  "lb_enabled": mdef.lb_enabled,
1843
  "pool_size_per_client": mdef.lb_pool_size,
1844
- })
 
 
 
1845
 
1846
 
1847
  # ═══════════════════════════════════════════════════════════════
@@ -1854,10 +1947,11 @@ if __name__ == "__main__":
1854
  log.info(f"Models: {list(MODEL_REGISTRY.keys())}")
1855
  log.info(f"Rate limit: {config.rate_limit_rps} req/s (burst: {config.rate_limit_burst})")
1856
  for mid, mdef in MODEL_REGISTRY.items():
1857
- lb_status_str = (
1858
  f"LB ON (pool={mdef.lb_pool_size})"
1859
  if mdef.lb_enabled
1860
  else "LB OFF (single instance)"
1861
  )
1862
- log.info(f" {mid}: {lb_status_str}")
 
1863
  app.run(host="0.0.0.0", port=port, threaded=True)
 
23
  # CONFIG & CONSTANTS
24
  # ═══════════════════════════════════════════════════════════════
25
 
26
+ VERSION = "2.4.0-hf-lb"
27
  APP_NAME = "Multi-Model-AI-API"
28
  DEFAULT_SYSTEM_PROMPT = "You are a helpful, friendly AI assistant."
29
  DEFAULT_MODEL = "gpt-oss-120b"
 
63
  api_name: Optional[str] = None
64
  extra_params: Dict[str, Any] = field(default_factory=dict)
65
  clean_analysis: bool = False
66
+ lb_pool_size: int = 2
67
+ lb_enabled: bool = True
68
+ is_beta: bool = False # Beta flag for experimental models
69
 
70
  MODEL_REGISTRY: Dict[str, ModelDef] = {}
71
 
 
103
  extra_params={"max_new_tokens": 700},
104
  lb_pool_size=1, lb_enabled=False, # NO load balancing for translate
105
  ))
106
+ # ── NEW: Command-A Reasoning ──
107
+ register_model(ModelDef(
108
+ model_id="command-a-reasoning", display_name="Cohere Command-A Reasoning",
109
+ provider_type="gradio_client", space_id="CohereLabs/command-a-reasoning",
110
+ owned_by="cohere", description="Cohere reasoning model with thinking budget",
111
+ api_name="/chat", supports_vision=False, supports_system_prompt=False,
112
+ supports_temperature=False, supports_streaming=False, supports_history=False,
113
+ supports_thinking=True, thinking_default=True, max_tokens_default=4096,
114
+ extra_params={"thinking_budget": 500},
115
+ lb_pool_size=2, lb_enabled=True,
116
+ ))
117
  register_model(ModelDef(
118
  model_id="minimax-vl-01", display_name="MiniMax VL-01",
119
  provider_type="gradio_client", space_id="MiniMaxAI/MiniMax-VL-01",
 
153
  supports_thinking=False, max_tokens_default=4096,
154
  lb_pool_size=2, lb_enabled=True,
155
  ))
156
+ # ── NEW: Qwen2.5-Coder (BETA) ──
157
+ register_model(ModelDef(
158
+ model_id="qwen2.5-coder", display_name="Qwen2.5-Coder Artifacts (BETA)",
159
+ provider_type="gradio_client", space_id="Qwen/Qwen2.5-Coder-Artifacts",
160
+ owned_by="alibaba", description="Alibaba Qwen2.5 Coder β€” code generation model (BETA)",
161
+ api_name="/generation_code", supports_vision=False, supports_system_prompt=True,
162
+ supports_temperature=False, supports_streaming=False, supports_history=False,
163
+ supports_thinking=False, max_tokens_default=4096,
164
+ extra_params={
165
+ "system_prompt_override": (
166
+ "You are a helpful assistant. You are a skilled programming assistant. "
167
+ "You help users write, debug, and understand code across all languages. "
168
+ "Respond with clear explanations and clean code. "
169
+ "Do NOT generate HTML artifacts or web page previews. "
170
+ "Do NOT wrap everything in a single HTML file. "
171
+ "Just provide the code the user asks for with explanations."
172
+ ),
173
+ },
174
+ lb_pool_size=2, lb_enabled=True,
175
+ is_beta=True,
176
+ ))
177
 
178
 
179
  _init_registry()
 
190
  max_retries: int = 3
191
  retry_backoff_base: float = 1.5
192
  retry_jitter: float = 0.5
193
+ rate_limit_rps: int = 10
194
+ rate_limit_burst: int = 15
195
  pool_size: int = 2
196
  max_history_messages: int = 50
197
  max_message_length: int = 10000
 
374
  return str(chatbot).strip() if chatbot else ""
375
  return str(result)
376
 
377
@classmethod
def extract_qwen_coder_text(cls, result: Any) -> str:
    """Extract the markdown answer from a Qwen2.5-Coder /generation_code call.

    The Gradio endpoint returns a ``(markdown_str, html_str)`` tuple; the
    markdown element (index 0) is preferred, with the HTML element used only
    as a fallback when the markdown is empty.

    Args:
        result: Raw value returned by ``client.predict(...)`` — may be
            ``None``, a plain string, the ``(markdown, html)`` tuple, or
            another JSON-ish container.

    Returns:
        The extracted text, or ``""`` when nothing usable is present.
    """
    if result is None:
        return ""
    if isinstance(result, str):
        return result.strip()
    if isinstance(result, tuple):
        # Prefer the markdown element (index 0).
        if len(result) >= 1 and isinstance(result[0], str):
            text = result[0].strip()
            if text:
                return text
        # Fall back to the HTML element if the markdown is empty.
        if len(result) >= 2 and isinstance(result[1], str):
            return result[1].strip()
        # Bug fix: a tuple with no usable text used to fall through and be
        # stringified (e.g. "('', '')"); return an empty string instead.
        return ""
    if isinstance(result, (list, dict)):
        return str(result)
    return str(result) if result else ""
+
399
  @classmethod
400
  def clean(cls, text: str, model_id: str = "",
401
  include_thinking: bool = True) -> str:
 
513
  requests_per_model: Dict[str, int] = field(default_factory=dict)
514
  _latencies: deque = field(default_factory=lambda: deque(maxlen=1000), repr=False)
515
  started_at: float = field(default_factory=time.time)
 
516
  lb_total_dispatches: int = 0
517
  lb_failovers: int = 0
518
 
 
572
  # ═══════════════════════════════════════════════════════════════
573
 
574
  class RateLimiter:
 
 
575
  def __init__(self, rps: int = 10, burst: int = 15):
576
+ self.rate = float(rps)
577
  self.max_tokens = float(burst)
578
  self.tokens = float(burst)
579
  self.last_refill = time.monotonic()
 
595
  return True
596
  if time.monotonic() >= deadline:
597
  return False
598
+ time.sleep(0.05)
599
 
600
  def get_info(self) -> Dict:
601
  with self._lock:
 
701
  self.instance_id = instance_id
702
  self.ready = False
703
  self._lock = threading.Lock()
 
704
  self._consecutive_failures = 0
705
  self._last_success_time = 0.0
706
  self._last_failure_time = 0.0
 
736
 
737
  @property
738
  def health_score(self) -> float:
 
739
  if not self.ready:
740
  return 0.0
741
  score = 1.0
 
742
  score -= min(self._consecutive_failures * 0.2, 0.8)
 
743
  if self._latencies:
744
  avg = self.avg_latency
745
  if avg > 10000:
746
  score -= 0.3
747
  elif avg > 5000:
748
  score -= 0.15
 
749
  if self._total_requests > 5:
750
  fail_rate = self._total_failures / self._total_requests
751
  score -= fail_rate * 0.4
 
949
  max_new_tokens=max_new,
950
  api_name=self.model_def.api_name,
951
  )
952
+
953
  elif mid == "command-a-translate":
954
  max_new = (max_tokens
955
  or self.model_def.extra_params.get("max_new_tokens", 700))
 
958
  max_new_tokens=max_new,
959
  api_name=self.model_def.api_name,
960
  )
961
+
962
+ elif mid == "command-a-reasoning":
963
+ # Cohere Command-A Reasoning with thinking budget
964
+ thinking_budget = kw.get(
965
+ "thinking_budget",
966
+ self.model_def.extra_params.get("thinking_budget", 500),
967
+ )
968
+ result = self._client.predict(
969
+ message=message,
970
+ thinking_budget=thinking_budget,
971
+ api_name=self.model_def.api_name,
972
+ )
973
+ return self._extract_reasoning(result)
974
+
975
  elif mid == "minimax-vl-01":
976
  temp = (temperature if temperature is not None
977
  else self.model_def.default_temperature)
 
984
  max_tokens=max_tok, temperature=temp, top_p=top_p,
985
  api_name=self.model_def.api_name,
986
  )
987
+
988
  elif mid == "glm-4.5":
989
  sys_p = system_prompt or self.config.default_system_prompt
990
  temp = (temperature if temperature is not None
 
999
  api_name=self.model_def.api_name,
1000
  )
1001
  return self._extract_glm(result, include)
1002
+
1003
  elif mid == "chatgpt":
1004
  temp = (temperature if temperature is not None
1005
  else self.model_def.default_temperature)
 
1017
  )
1018
  self._chat_counter += 1
1019
  return ResponseCleaner.extract_chatgpt_text(result)
1020
+
1021
  elif mid == "qwen3-vl":
1022
  result = self._client.predict(
1023
  input_value={"files": None, "text": message},
1024
  api_name="/add_message",
1025
  )
1026
  return ResponseCleaner.extract_qwen_text(result)
1027
+
1028
+ elif mid == "qwen2.5-coder":
1029
+ # First set the system prompt to override artifacts behavior
1030
+ sys_override = self.model_def.extra_params.get(
1031
+ "system_prompt_override", ""
1032
+ )
1033
+ if sys_override:
1034
+ try:
1035
+ self._client.predict(
1036
+ input=sys_override,
1037
+ api_name="/lambda_1",
1038
+ )
1039
+ except Exception as e:
1040
+ log.warning(f"[qwen2.5-coder] Failed to set system prompt: {e}")
1041
+
1042
+ result = self._client.predict(
1043
+ query=message,
1044
+ api_name="/generation_code",
1045
+ )
1046
+ return ResponseCleaner.extract_qwen_coder_text(result)
1047
+
1048
  else:
1049
  raise APIError(f"Unknown model handler: {mid}")
1050
 
 
1061
  except Exception as e:
1062
  raise APIError(f"{mid} error: {e}", "PROVIDER_ERROR")
1063
 
1064
+ def _extract_reasoning(self, result: Any) -> str:
1065
+ """Extract response from Command-A Reasoning.
1066
+ The API returns str | float | bool | list | dict from the Json component."""
1067
+ if result is None:
1068
+ return ""
1069
+ if isinstance(result, str):
1070
+ return result.strip()
1071
+ if isinstance(result, dict):
1072
+ # Try common response keys
1073
+ for key in ("response", "output", "answer", "text", "content", "result"):
1074
+ if key in result:
1075
+ val = result[key]
1076
+ if isinstance(val, str):
1077
+ return val.strip()
1078
+ return str(val)
1079
+ # Check for thinking + response structure
1080
+ thinking = result.get("thinking", "")
1081
+ response = result.get("response", result.get("output", ""))
1082
+ if thinking and response:
1083
+ return f"<thinking>\n{thinking}\n</thinking>\n{response}"
1084
+ if response:
1085
+ return str(response).strip()
1086
+ # Fallback: serialize entire dict
1087
+ return json.dumps(result, ensure_ascii=False, indent=2)
1088
+ if isinstance(result, (list, tuple)):
1089
+ if len(result) == 1:
1090
+ return str(result[0]).strip()
1091
+ # Try to find text in list elements
1092
+ texts = []
1093
+ for item in result:
1094
+ if isinstance(item, str) and item.strip():
1095
+ texts.append(item.strip())
1096
+ if texts:
1097
+ return "\n".join(texts)
1098
+ return json.dumps(result, ensure_ascii=False)
1099
+ if isinstance(result, (int, float, bool)):
1100
+ return str(result)
1101
+ return str(result)
1102
+
1103
  def _extract_glm(self, result, include_thinking: bool = True) -> str:
1104
  if isinstance(result, tuple) and len(result) >= 1:
1105
  chatbot = result[0]
 
1120
  return ResponseCleaner.clean_glm(str(result), include_thinking)
1121
 
1122
 
1123
+ # Factory
1124
  def create_provider(model_id: str, config: Config,
1125
  instance_id: int = 0) -> ModelProvider:
1126
  if model_id not in MODEL_REGISTRY:
 
1131
  return GradioClientProvider(mdef, config, instance_id)
1132
 
1133
  # ═══════════════════════════════════════════════════════════════
1134
+ # LOAD BALANCER
 
1135
  # ═══════════════════════════════════════════════════════════════
1136
 
1137
  class LoadBalancedProviderPool:
 
 
 
 
 
 
 
1138
  def __init__(self, model_id: str, config: Config):
1139
  self.model_id = model_id
1140
  self.config = config
 
1157
  return len(self._instances)
1158
 
1159
  def initialize_all(self) -> int:
 
1160
  ok = 0
1161
  for inst in self._instances:
1162
  try:
 
1170
  return ok
1171
 
1172
  def initialize_one(self) -> bool:
 
1173
  for inst in self._instances:
1174
  try:
1175
  if inst.initialize():
 
1179
  return False
1180
 
1181
  def _select_instance(self) -> ModelProvider:
 
 
 
 
 
1182
  if len(self._instances) == 1:
1183
  return self._instances[0]
1184
 
1185
  with self._lock:
 
1186
  scored = []
1187
  for inst in self._instances:
1188
  score = inst.health_score
 
1189
  scored.append((inst, max(score, 0.05)))
1190
 
1191
  total_weight = sum(s for _, s in scored)
1192
  if total_weight <= 0:
 
1193
  inst = self._instances[self._rr_index % len(self._instances)]
1194
  self._rr_index += 1
1195
  return inst
1196
 
 
1197
  r = random.uniform(0, total_weight)
1198
  cumulative = 0.0
1199
  for inst, weight in scored:
 
1201
  if r <= cumulative:
1202
  return inst
1203
 
 
1204
  return scored[-1][0]
1205
 
1206
  def _get_ordered_instances(self) -> List[ModelProvider]:
 
1207
  return sorted(self._instances, key=lambda p: p.health_score, reverse=True)
1208
 
1209
  def execute(self, fn_name: str, **kwargs) -> Any:
 
 
 
 
1210
  primary = self._select_instance()
1211
  metrics.record_lb_dispatch()
1212
 
 
1213
  if not primary.ready:
1214
  try:
1215
  primary.initialize()
1216
  except Exception:
1217
  pass
1218
 
 
1219
  start = time.monotonic()
1220
  try:
1221
  result = self._call_provider(primary, fn_name, **kwargs)
 
1229
  f"'{self.model_id}' failed: {primary_err}"
1230
  )
1231
 
 
1232
  for inst in self._get_ordered_instances():
1233
  if inst is primary:
1234
  continue
 
1262
  )
1263
 
1264
  def execute_stream(self, **kwargs) -> Generator[str, None, None]:
 
 
 
 
 
1265
  primary = self._select_instance()
1266
  metrics.record_lb_dispatch()
1267
 
 
1271
  except Exception:
1272
  pass
1273
 
 
1274
  try:
1275
  yield from self._call_provider_stream(primary, **kwargs)
1276
  return
 
1281
  f"for '{self.model_id}' failed: {primary_err}"
1282
  )
1283
 
 
1284
  for inst in self._get_ordered_instances():
1285
  if inst is primary:
1286
  continue
 
1331
  "model_id": self.model_id,
1332
  "lb_enabled": self.mdef.lb_enabled,
1333
  "pool_size": len(self._instances),
1334
+ "is_beta": self.mdef.is_beta,
1335
  "instances": [inst.get_instance_info() for inst in self._instances],
1336
  }
1337
 
 
1370
  return self._lb_pools[model_id]
1371
 
1372
  def _ensure_ready(self, model_id: str) -> LoadBalancedProviderPool:
1373
+ lb_pool = self._get_lb_pool(model_id)
1374
+ has_ready = any(inst.ready for inst in lb_pool._instances)
 
1375
  if not has_ready:
1376
+ if not lb_pool.initialize_one():
1377
  raise APIError(f"Cannot init any instance for {model_id}",
1378
  "INIT_FAILED")
1379
+ return lb_pool
1380
 
1381
  @property
1382
  def active_conversation(self) -> Conversation:
 
1401
 
1402
  def init_model(self, model_id: str) -> bool:
1403
  try:
1404
+ lb_pool = self._get_lb_pool(model_id)
1405
+ return lb_pool.initialize_one()
1406
  except Exception:
1407
  return False
1408
 
1409
  def init_model_all(self, model_id: str) -> int:
 
1410
  try:
1411
+ lb_pool = self._get_lb_pool(model_id)
1412
+ return lb_pool.initialize_all()
1413
  except Exception:
1414
  return 0
1415
 
 
1538
 
1539
  def get_status(self) -> Dict:
1540
  lb_info = {}
1541
+ for model_id, lb_pool in self._lb_pools.items():
1542
+ lb_info[model_id] = lb_pool.get_pool_info()
1543
 
1544
  return {
1545
  "version": VERSION,
 
1552
  }
1553
 
1554
  # ═══════════════════════════════════════════════════════════════
1555
+ # SESSION POOL
1556
  # ═══════════════════════════════════════════════════════════════
1557
 
1558
  class SessionPool:
 
1590
  "cohere-vision": "command-a-vision",
1591
  "command-translate": "command-a-translate",
1592
  "cohere-translate": "command-a-translate", "translate": "command-a-translate",
1593
+ "command-reasoning": "command-a-reasoning", "reasoning": "command-a-reasoning",
1594
+ "cohere-reasoning": "command-a-reasoning", "command-r": "command-a-reasoning",
1595
  "minimax": "minimax-vl-01", "minimax-vl": "minimax-vl-01",
1596
  "glm": "glm-4.5", "glm4": "glm-4.5", "glm-4": "glm-4.5", "zhipu": "glm-4.5",
1597
  "gpt": "chatgpt", "gpt-3.5": "chatgpt", "gpt3": "chatgpt", "openai": "chatgpt",
1598
  "qwen": "qwen3-vl", "qwen3": "qwen3-vl", "qwen-vl": "qwen3-vl",
1599
+ "qwen-coder": "qwen2.5-coder", "qwen2.5": "qwen2.5-coder",
1600
+ "qwen25-coder": "qwen2.5-coder", "coder": "qwen2.5-coder",
1601
  }
1602
 
1603
 
 
1636
  "default_model": config.default_model,
1637
  "features": ["load_balancing", "10_req_per_second_limit", "failover"],
1638
  "models": list(MODEL_REGISTRY.keys()),
1639
+ "beta_models": [mid for mid, mdef in MODEL_REGISTRY.items() if mdef.is_beta],
1640
  "endpoints": {
1641
  "POST /chat": "Chat with any model",
1642
  "POST /chat/stream": "Streaming chat",
 
1661
  client = pool.acquire()
1662
  if data.get("new_conversation"):
1663
  client.new_conversation(data.get("system_prompt"), model_id)
1664
+
1665
+ # Pass extra params for specific models
1666
+ extra = {}
1667
+ if model_id == "command-a-reasoning" and "thinking_budget" in data:
1668
+ extra["thinking_budget"] = data["thinking_budget"]
1669
+
1670
  result = client.send_message(
1671
  message, model=model_id,
1672
  system_prompt=data.get("system_prompt"),
1673
  temperature=data.get("temperature"),
1674
  max_tokens=data.get("max_tokens"),
1675
  include_thinking=include_thinking,
1676
+ **extra,
1677
  )
1678
  thinking, clean = ThinkingParser.split(result)
1679
+ mdef = MODEL_REGISTRY.get(model_id)
1680
  resp = {
1681
  "ok": True,
1682
  "response": clean,
 
1686
  }
1687
  if thinking:
1688
  resp["thinking"] = thinking
1689
+ if mdef and mdef.is_beta:
1690
+ resp["beta"] = True
1691
  return jsonify(resp)
1692
 
1693
 
 
1705
  mdef = MODEL_REGISTRY.get(model_id)
1706
  use_stream = mdef.supports_streaming if mdef else False
1707
 
1708
+ extra = {}
1709
+ if model_id == "command-a-reasoning" and "thinking_budget" in data:
1710
+ extra["thinking_budget"] = data["thinking_budget"]
1711
+
1712
  def generate():
1713
  try:
1714
  if use_stream:
 
1718
  temperature=data.get("temperature"),
1719
  max_tokens=data.get("max_tokens"),
1720
  include_thinking=include_thinking,
1721
+ **extra,
1722
  ):
1723
  yield f"data: {json.dumps({'chunk': chunk})}\n\n"
1724
  else:
 
1728
  temperature=data.get("temperature"),
1729
  max_tokens=data.get("max_tokens"),
1730
  include_thinking=include_thinking,
1731
+ **extra,
1732
  )
1733
  yield f"data: {json.dumps({'chunk': result})}\n\n"
1734
  yield "data: [DONE]\n\n"
 
1743
  def list_models():
1744
  models = []
1745
  for mid, mdef in MODEL_REGISTRY.items():
1746
+ model_info = {
1747
  "id": mid,
1748
  "object": "model",
1749
  "owned_by": mdef.owned_by,
 
1761
  "enabled": mdef.lb_enabled,
1762
  "pool_size": mdef.lb_pool_size,
1763
  },
1764
+ }
1765
+ if mdef.is_beta:
1766
+ model_info["beta"] = True
1767
+ models.append(model_info)
1768
  return jsonify({"object": "list", "data": models})
1769
 
1770
 
 
1804
  client = pool.acquire()
1805
  client.new_conversation(system_prompt, model_id)
1806
 
 
1807
  for msg in messages[:-1]:
1808
  role = msg.get("role")
1809
  content = msg.get("content", "")
 
1812
 
1813
  mdef = MODEL_REGISTRY[model_id]
1814
 
1815
+ # Extra params
1816
+ extra = {}
1817
+ if model_id == "command-a-reasoning" and "thinking_budget" in data:
1818
+ extra["thinking_budget"] = data["thinking_budget"]
1819
+
1820
  if do_stream:
1821
  def generate():
1822
  try:
1823
+ yield f"data: {json.dumps({'id': rid, 'object': 'chat.completion.chunk', 'created': created, 'model': model_id, 'choices': [{'index': 0, 'delta': {'role': 'assistant'}, 'finish_reason': None}]})}\n\n"
 
 
 
 
 
1824
  if mdef.supports_streaming:
1825
  for chunk in client.send_message(
1826
  user_msg, stream=True, model=model_id,
1827
  temperature=temperature, max_tokens=max_tokens,
1828
+ include_thinking=include_thinking, **extra,
1829
  ):
1830
+ yield f"data: {json.dumps({'id': rid, 'object': 'chat.completion.chunk', 'created': created, 'model': model_id, 'choices': [{'index': 0, 'delta': {'content': chunk}, 'finish_reason': None}]})}\n\n"
 
 
 
 
 
 
 
1831
  else:
1832
  result = client.send_message(
1833
  user_msg, model=model_id, temperature=temperature,
1834
  max_tokens=max_tokens,
1835
+ include_thinking=include_thinking, **extra,
 
 
 
 
 
 
 
 
1836
  )
1837
+ yield f"data: {json.dumps({'id': rid, 'object': 'chat.completion.chunk', 'created': created, 'model': model_id, 'choices': [{'index': 0, 'delta': {'content': result}, 'finish_reason': None}]})}\n\n"
1838
+ yield f"data: {json.dumps({'id': rid, 'object': 'chat.completion.chunk', 'created': created, 'model': model_id, 'choices': [{'index': 0, 'delta': {}, 'finish_reason': 'stop'}]})}\n\n"
 
 
 
 
 
1839
  yield "data: [DONE]\n\n"
1840
  except Exception as e:
1841
  yield f"data: {json.dumps({'error': {'message': str(e)}})}\n\n"
 
1845
 
1846
  result = client.send_message(
1847
  user_msg, model=model_id, temperature=temperature,
1848
+ max_tokens=max_tokens, include_thinking=include_thinking, **extra,
1849
  )
1850
  return jsonify({
1851
  "id": rid,
 
1891
 
1892
  @app.route("/lb/status", methods=["GET"])
1893
  def lb_status():
 
1894
  all_pools = {}
1895
  for client in pool._clients:
1896
  for model_id, lb_pool in client._lb_pools.items():
1897
+ key = model_id
1898
  if key not in all_pools:
1899
  all_pools[key] = []
1900
  all_pools[key].append(lb_pool.get_pool_info())
 
1925
  }), 400
1926
  count = pool.init_model(model_id)
1927
  mdef = MODEL_REGISTRY[model_id]
1928
+ resp = {
1929
  "ok": True,
1930
  "model": model_id,
1931
  "initialized_instances": count,
1932
  "lb_enabled": mdef.lb_enabled,
1933
  "pool_size_per_client": mdef.lb_pool_size,
1934
+ }
1935
+ if mdef.is_beta:
1936
+ resp["beta"] = True
1937
+ return jsonify(resp)
1938
 
1939
 
1940
  # ═══════════════════════════════════════════════════════════════
 
1947
  log.info(f"Models: {list(MODEL_REGISTRY.keys())}")
1948
  log.info(f"Rate limit: {config.rate_limit_rps} req/s (burst: {config.rate_limit_burst})")
1949
  for mid, mdef in MODEL_REGISTRY.items():
1950
+ lb_str = (
1951
  f"LB ON (pool={mdef.lb_pool_size})"
1952
  if mdef.lb_enabled
1953
  else "LB OFF (single instance)"
1954
  )
1955
+ beta_str = " [BETA]" if mdef.is_beta else ""
1956
+ log.info(f" {mid}: {lb_str}{beta_str}")
1957
  app.run(host="0.0.0.0", port=port, threaded=True)