Files changed (1)
  1. app.py +223 -64
app.py CHANGED
@@ -1,20 +1,21 @@
1
  #!/usr/bin/env python3
2
  """
3
  Multi-Model AI API — HuggingFace Spaces Edition
4
- With load balancing (multiple provider instances per model) and 10 req/s rate limiting.
5
  """
6
 
7
- import re, os, json, uuid, time, random, string, logging, threading
8
  from abc import ABC, abstractmethod
9
  from collections import deque
10
  from dataclasses import dataclass, field
11
  from typing import Any, Dict, Generator, List, Optional, Tuple, Union
 
12
 
13
  import requests
14
  from flask import Flask, request as freq, jsonify, Response, stream_with_context
15
 
16
  try:
17
- from gradio_client import Client as GradioClient
18
  HAS_GRADIO_CLIENT = True
19
  except ImportError:
20
  HAS_GRADIO_CLIENT = False
@@ -23,7 +24,7 @@ except ImportError:
23
  # CONFIG & CONSTANTS
24
  # ═══════════════════════════════════════════════════════════════
25
 
26
- VERSION = "2.4.0-hf-lb"
27
  APP_NAME = "Multi-Model-AI-API"
28
  DEFAULT_SYSTEM_PROMPT = "You are a helpful, friendly AI assistant."
29
  DEFAULT_MODEL = "gpt-oss-120b"
@@ -38,6 +39,83 @@ USER_AGENTS = [
38
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:128.0) Gecko/20100101 Firefox/128.0",
39
  ]
40

41
  # ═══════════════════════════════════════════════════════════════
42
  # MODEL REGISTRY
43
  # ═══════════════════════════════════════════════════════════════
@@ -65,7 +143,8 @@ class ModelDef:
65
  clean_analysis: bool = False
66
  lb_pool_size: int = 2
67
  lb_enabled: bool = True
68
- is_beta: bool = False # Beta flag for experimental models
 
69
 
70
  MODEL_REGISTRY: Dict[str, ModelDef] = {}
71
 
@@ -101,9 +180,8 @@ def _init_registry():
101
  supports_temperature=False, supports_streaming=False, supports_history=False,
102
  supports_thinking=False, max_tokens_default=700,
103
  extra_params={"max_new_tokens": 700},
104
- lb_pool_size=1, lb_enabled=False, # NO load balancing for translate
105
  ))
106
- # ── NEW: Command-A Reasoning ──
107
  register_model(ModelDef(
108
  model_id="command-a-reasoning", display_name="Cohere Command-A Reasoning",
109
  provider_type="gradio_client", space_id="CohereLabs/command-a-reasoning",
@@ -153,7 +231,6 @@ def _init_registry():
153
  supports_thinking=False, max_tokens_default=4096,
154
  lb_pool_size=2, lb_enabled=True,
155
  ))
156
- # ── NEW: Qwen2.5-Coder (BETA) ──
157
  register_model(ModelDef(
158
  model_id="qwen2.5-coder", display_name="Qwen2.5-Coder Artifacts (BETA)",
159
  provider_type="gradio_client", space_id="Qwen/Qwen2.5-Coder-Artifacts",
@@ -194,7 +271,7 @@ class Config:
194
  rate_limit_burst: int = 15
195
  pool_size: int = 2
196
  max_history_messages: int = 50
197
- max_message_length: int = 10000
198
  default_temperature: float = 0.7
199
  include_thinking: bool = True
200
  log_sse_raw: bool = False
@@ -376,20 +453,15 @@ class ResponseCleaner:
376
 
377
  @classmethod
378
  def extract_qwen_coder_text(cls, result: Any) -> str:
379
- """Extract text from Qwen2.5-Coder /generation_code response.
380
- Returns tuple of (markdown, html). We want the markdown part."""
381
  if result is None:
382
  return ""
383
  if isinstance(result, str):
384
  return result.strip()
385
  if isinstance(result, tuple):
386
- # /generation_code returns (markdown_str, html_str)
387
- # We want the markdown part (index 0)
388
  if len(result) >= 1 and isinstance(result[0], str):
389
  text = result[0].strip()
390
  if text:
391
  return text
392
- # Fallback to second element if first is empty
393
  if len(result) >= 2 and isinstance(result[1], str):
394
  return result[1].strip()
395
  if isinstance(result, (list, dict)):
@@ -568,7 +640,7 @@ class Metrics:
568
  metrics = Metrics()
569
 
570
  # ═══════════════════════════════════════════════════════════════
571
- # RATE LIMITER — 10 requests per SECOND (token bucket)
572
  # ═══════════════════════════════════════════════════════════════
573
 
574
  class RateLimiter:
@@ -713,7 +785,7 @@ class ModelProvider(ABC):
713
 
714
  @abstractmethod
715
  def generate(self, message: str, history=None, system_prompt=None,
716
- temperature=None, max_tokens=None, **kwargs) -> str: ...
717
 
718
  def generate_stream(self, message: str, **kwargs) -> Generator[str, None, None]:
719
  yield self.generate(message, **kwargs)
@@ -795,7 +867,7 @@ class GptOssProvider(ModelProvider):
795
  return False
796
 
797
  def generate(self, message, history=None, system_prompt=None,
798
- temperature=None, max_tokens=None, **kw):
799
  if not self.ready:
800
  self.initialize()
801
  sys_p = system_prompt or self.config.default_system_prompt
@@ -848,7 +920,7 @@ class GptOssProvider(ModelProvider):
848
  if self.model_def.clean_analysis else full)
849
 
850
  def generate_stream(self, message, history=None, system_prompt=None,
851
- temperature=None, max_tokens=None, **kw):
852
  if not self.ready:
853
  self.initialize()
854
  sys_p = system_prompt or self.config.default_system_prompt
@@ -933,19 +1005,31 @@ class GradioClientProvider(ModelProvider):
933
  return False
934
 
935
  def generate(self, message, history=None, system_prompt=None,
936
- temperature=None, max_tokens=None, **kw):
937
  if not self.ready:
938
  self.initialize()
939
  if not self._client:
940
  raise APIError(f"{self.model_def.model_id} not initialized")
941
 
942
  mid = self.model_def.model_id
 
 
943
  try:
944
  if mid == "command-a-vision":
945
  max_new = (max_tokens
946
  or self.model_def.extra_params.get("max_new_tokens", 700))
947
  result = self._client.predict(
948
- message={"text": message, "files": []},
949
  max_new_tokens=max_new,
950
  api_name=self.model_def.api_name,
951
  )
@@ -960,7 +1044,6 @@ class GradioClientProvider(ModelProvider):
960
  )
961
 
962
  elif mid == "command-a-reasoning":
963
- # Cohere Command-A Reasoning with thinking budget
964
  thinking_budget = kw.get(
965
  "thinking_budget",
966
  self.model_def.extra_params.get("thinking_budget", 500),
@@ -979,8 +1062,14 @@ class GradioClientProvider(ModelProvider):
979
  or self.model_def.extra_params.get("max_tokens", 12800))
980
  top_p = kw.get("top_p",
981
  self.model_def.extra_params.get("top_p", 0.9))
982
  result = self._client.predict(
983
- message={"text": message, "files": []},
984
  max_tokens=max_tok, temperature=temp, top_p=top_p,
985
  api_name=self.model_def.api_name,
986
  )
@@ -1019,14 +1108,22 @@ class GradioClientProvider(ModelProvider):
1019
  return ResponseCleaner.extract_chatgpt_text(result)
1020
 
1021
  elif mid == "qwen3-vl":
1022
- result = self._client.predict(
1023
- input_value={"files": None, "text": message},
1024
- api_name="/add_message",
1025
- )
1026
  return ResponseCleaner.extract_qwen_text(result)
1027
 
1028
  elif mid == "qwen2.5-coder":
1029
- # First set the system prompt to override artifacts behavior
1030
  sys_override = self.model_def.extra_params.get(
1031
  "system_prompt_override", ""
1032
  )
@@ -1062,33 +1159,27 @@ class GradioClientProvider(ModelProvider):
1062
  raise APIError(f"{mid} error: {e}", "PROVIDER_ERROR")
1063
 
1064
  def _extract_reasoning(self, result: Any) -> str:
1065
- """Extract response from Command-A Reasoning.
1066
- The API returns str | float | bool | list | dict from the Json component."""
1067
  if result is None:
1068
  return ""
1069
  if isinstance(result, str):
1070
  return result.strip()
1071
  if isinstance(result, dict):
1072
- # Try common response keys
1073
  for key in ("response", "output", "answer", "text", "content", "result"):
1074
  if key in result:
1075
  val = result[key]
1076
  if isinstance(val, str):
1077
  return val.strip()
1078
  return str(val)
1079
- # Check for thinking + response structure
1080
  thinking = result.get("thinking", "")
1081
  response = result.get("response", result.get("output", ""))
1082
  if thinking and response:
1083
  return f"<thinking>\n{thinking}\n</thinking>\n{response}"
1084
  if response:
1085
  return str(response).strip()
1086
- # Fallback: serialize entire dict
1087
  return json.dumps(result, ensure_ascii=False, indent=2)
1088
  if isinstance(result, (list, tuple)):
1089
  if len(result) == 1:
1090
  return str(result[0]).strip()
1091
- # Try to find text in list elements
1092
  texts = []
1093
  for item in result:
1094
  if isinstance(item, str) and item.strip():
@@ -1120,7 +1211,6 @@ class GradioClientProvider(ModelProvider):
1120
  return ResponseCleaner.clean_glm(str(result), include_thinking)
1121
 
1122
 
1123
- # Factory
1124
  def create_provider(model_id: str, config: Config,
1125
  instance_id: int = 0) -> ModelProvider:
1126
  if model_id not in MODEL_REGISTRY:
@@ -1253,7 +1343,7 @@ class LoadBalancedProviderPool:
1253
  inst.record_failure()
1254
  log.warning(
1255
  f"[LB] Failover instance {inst.instance_id} "
1256
- f"for '{self.model_id}' also failed: {e}"
1257
  )
1258
 
1259
  raise APIError(
@@ -1336,7 +1426,7 @@ class LoadBalancedProviderPool:
1336
  }
1337
 
1338
  # ═══════════════════════════════════════════════════════════════
1339
- # MULTI-MODEL CLIENT (with load balancing)
1340
  # ═══════════════════════════════════════════════════════════════
1341
 
1342
  class MultiModelClient:
@@ -1415,7 +1505,7 @@ class MultiModelClient:
1415
 
1416
  def send_message(
1417
  self,
1418
- message: str,
1419
  *,
1420
  stream: bool = False,
1421
  model: Optional[str] = None,
@@ -1424,14 +1514,27 @@ class MultiModelClient:
1424
  temperature: Optional[float] = None,
1425
  max_tokens: Optional[int] = None,
1426
  include_thinking: Optional[bool] = None,
 
1427
  **kwargs,
1428
  ) -> Union[str, Generator]:
1429
  model_id = model or self._current_model
1430
  if model_id not in MODEL_REGISTRY:
1431
  raise ModelNotFoundError(model_id)
1432
  mdef = MODEL_REGISTRY[model_id]
1433
- message = message.strip()
1434
- if not message:
1435
  raise APIError("Empty message", "INVALID_INPUT", 400)
1436
  if len(message) > self.config.max_message_length:
1437
  raise APIError("Message too long", "INVALID_INPUT", 400)
@@ -1479,6 +1582,7 @@ class MultiModelClient:
1479
  system_prompt=eff_sys,
1480
  temperature=eff_temp,
1481
  max_tokens=max_tokens,
 
1482
  **extra,
1483
  )
1484
  return self._wrap_stream(gen, conv, start, model_id)
@@ -1490,6 +1594,7 @@ class MultiModelClient:
1490
  system_prompt=eff_sys,
1491
  temperature=eff_temp,
1492
  max_tokens=max_tokens,
 
1493
  **extra,
1494
  )
1495
  dur = (time.monotonic() - start) * 1000
@@ -1581,7 +1686,7 @@ class SessionPool:
1581
  return c
1582
 
1583
  # ═══════════════════════════════════════════════════════════════
1584
- # MODEL ALIAS RESOLVER
1585
  # ═══════════════════════════════════════════════════════════════
1586
 
1587
  ALIASES = {
@@ -1602,6 +1707,8 @@ ALIASES = {
1602
 
1603
 
1604
  def resolve_alias(model_id: str) -> str:
 
 
1605
  return ALIASES.get(model_id.lower(), model_id)
1606
 
1607
  # ═══════════════════════════════════════════════════════════════
@@ -1634,18 +1741,19 @@ def index():
1634
  "name": APP_NAME,
1635
  "version": VERSION,
1636
  "default_model": config.default_model,
1637
- "features": ["load_balancing", "10_req_per_second_limit", "failover"],
1638
  "models": list(MODEL_REGISTRY.keys()),
1639
  "beta_models": [mid for mid, mdef in MODEL_REGISTRY.items() if mdef.is_beta],
 
1640
  "endpoints": {
1641
  "POST /chat": "Chat with any model",
1642
  "POST /chat/stream": "Streaming chat",
1643
- "POST /v1/chat/completions": "OpenAI-compatible",
1644
  "GET /v1/models": "List models",
1645
  "POST /models/init": "Init a model",
1646
- "GET /health": "Health check (incl. LB status)",
1647
  "GET /metrics": "Metrics",
1648
- "GET /lb/status": "Load balancer detailed status",
1649
  },
1650
  })
1651
 
@@ -1653,16 +1761,26 @@ def index():
1653
  @app.route("/chat", methods=["POST"])
1654
  def chat():
1655
  data = freq.get_json(force=True, silent=True) or {}
1656
- message = data.get("message", "").strip()
1657
- if not message:
1658
  return jsonify({"ok": False, "error": "'message' required"}), 400
 
1659
  model_id = resolve_alias(data.get("model", config.default_model))
1660
  include_thinking = data.get("include_thinking", config.include_thinking)
1661
  client = pool.acquire()
1662
  if data.get("new_conversation"):
1663
  client.new_conversation(data.get("system_prompt"), model_id)
1664
 
1665
- # Pass extra params for specific models
1666
  extra = {}
1667
  if model_id == "command-a-reasoning" and "thinking_budget" in data:
1668
  extra["thinking_budget"] = data["thinking_budget"]
@@ -1673,6 +1791,7 @@ def chat():
1673
  temperature=data.get("temperature"),
1674
  max_tokens=data.get("max_tokens"),
1675
  include_thinking=include_thinking,
 
1676
  **extra,
1677
  )
1678
  thinking, clean = ThinkingParser.split(result)
@@ -1694,9 +1813,19 @@ def chat():
1694
  @app.route("/chat/stream", methods=["POST"])
1695
  def chat_stream():
1696
  data = freq.get_json(force=True, silent=True) or {}
1697
- message = data.get("message", "").strip()
1698
- if not message:
1699
  return jsonify({"ok": False, "error": "'message' required"}), 400
 
1700
  model_id = resolve_alias(data.get("model", config.default_model))
1701
  include_thinking = data.get("include_thinking", config.include_thinking)
1702
  client = pool.acquire()
@@ -1718,6 +1847,7 @@ def chat_stream():
1718
  temperature=data.get("temperature"),
1719
  max_tokens=data.get("max_tokens"),
1720
  include_thinking=include_thinking,
 
1721
  **extra,
1722
  ):
1723
  yield f"data: {json.dumps({'chunk': chunk})}\n\n"
@@ -1728,6 +1858,7 @@ def chat_stream():
1728
  temperature=data.get("temperature"),
1729
  max_tokens=data.get("max_tokens"),
1730
  include_thinking=include_thinking,
 
1731
  **extra,
1732
  )
1733
  yield f"data: {json.dumps({'chunk': result})}\n\n"
@@ -1772,6 +1903,7 @@ def list_models():
1772
  def openai_compat():
1773
  if freq.method == "OPTIONS":
1774
  return "", 200
 
1775
  data = freq.get_json(force=True, silent=True) or {}
1776
  messages = data.get("messages", [])
1777
  do_stream = data.get("stream", False)
@@ -1783,20 +1915,38 @@ def openai_compat():
1783
  if model_id not in MODEL_REGISTRY:
1784
  return jsonify({
1785
  "error": {
1786
- "message": f"Model '{model_id}' not found",
1787
  "type": "invalid_request_error",
 
1788
  }
1789
  }), 404
 
1790
  if not messages:
1791
  return jsonify({"error": {"message": "messages required"}}), 400
1792
 
1793
- user_msg = system_prompt = None
1794
  for msg in messages:
1795
- if msg.get("role") == "system":
1796
- system_prompt = msg.get("content")
1797
- if msg.get("role") == "user":
1798
- user_msg = msg.get("content", "")
1799
- if not user_msg:
1800
  return jsonify({"error": {"message": "No user message"}}), 400
1801
 
1802
  rid = f"chatcmpl-{uuid.uuid4().hex[:29]}"
@@ -1804,15 +1954,21 @@ def openai_compat():
1804
  client = pool.acquire()
1805
  client.new_conversation(system_prompt, model_id)
1806
 
 
1807
  for msg in messages[:-1]:
1808
  role = msg.get("role")
1809
  content = msg.get("content", "")
1810
  if role in ("user", "assistant") and content:
1811
- client.active_conversation.add_message(role, content)
1812
 
1813
  mdef = MODEL_REGISTRY[model_id]
1814
 
1815
- # Extra params
1816
  extra = {}
1817
  if model_id == "command-a-reasoning" and "thinking_budget" in data:
1818
  extra["thinking_budget"] = data["thinking_budget"]
@@ -1825,14 +1981,15 @@ def openai_compat():
1825
  for chunk in client.send_message(
1826
  user_msg, stream=True, model=model_id,
1827
  temperature=temperature, max_tokens=max_tokens,
1828
- include_thinking=include_thinking, **extra,
 
1829
  ):
1830
  yield f"data: {json.dumps({'id': rid, 'object': 'chat.completion.chunk', 'created': created, 'model': model_id, 'choices': [{'index': 0, 'delta': {'content': chunk}, 'finish_reason': None}]})}\n\n"
1831
  else:
1832
  result = client.send_message(
1833
  user_msg, model=model_id, temperature=temperature,
1834
- max_tokens=max_tokens,
1835
- include_thinking=include_thinking, **extra,
1836
  )
1837
  yield f"data: {json.dumps({'id': rid, 'object': 'chat.completion.chunk', 'created': created, 'model': model_id, 'choices': [{'index': 0, 'delta': {'content': result}, 'finish_reason': None}]})}\n\n"
1838
  yield f"data: {json.dumps({'id': rid, 'object': 'chat.completion.chunk', 'created': created, 'model': model_id, 'choices': [{'index': 0, 'delta': {}, 'finish_reason': 'stop'}]})}\n\n"
@@ -1845,7 +2002,8 @@ def openai_compat():
1845
 
1846
  result = client.send_message(
1847
  user_msg, model=model_id, temperature=temperature,
1848
- max_tokens=max_tokens, include_thinking=include_thinking, **extra,
 
1849
  )
1850
  return jsonify({
1851
  "id": rid,
@@ -1943,7 +2101,7 @@ def init_model_ep():
1943
 
1944
  if __name__ == "__main__":
1945
  port = int(os.environ.get("PORT", 7860))
1946
- log.info(f"Starting Multi-Model AI API v{VERSION} on port {port}")
1947
  log.info(f"Models: {list(MODEL_REGISTRY.keys())}")
1948
  log.info(f"Rate limit: {config.rate_limit_rps} req/s (burst: {config.rate_limit_burst})")
1949
  for mid, mdef in MODEL_REGISTRY.items():
@@ -1952,6 +2110,7 @@ if __name__ == "__main__":
1952
  if mdef.lb_enabled
1953
  else "LB OFF (single instance)"
1954
  )
 
1955
  beta_str = " [BETA]" if mdef.is_beta else ""
1956
- log.info(f" {mid}: {lb_str}{beta_str}")
1957
  app.run(host="0.0.0.0", port=port, threaded=True)
 
1
  #!/usr/bin/env python3
2
  """
3
  Multi-Model AI API — HuggingFace Spaces Edition
4
+ With load balancing, 10 req/s rate limiting, vision support, and OpenAI-style multimodal message handling.
5
  """
6
 
7
+ import re, os, json, uuid, time, random, string, logging, threading, base64
8
  from abc import ABC, abstractmethod
9
  from collections import deque
10
  from dataclasses import dataclass, field
11
  from typing import Any, Dict, Generator, List, Optional, Tuple, Union
12
+ from io import BytesIO
13
 
14
  import requests
15
  from flask import Flask, request as freq, jsonify, Response, stream_with_context
16
 
17
  try:
18
+ from gradio_client import Client as GradioClient, handle_file
19
  HAS_GRADIO_CLIENT = True
20
  except ImportError:
21
  HAS_GRADIO_CLIENT = False
 
24
  # CONFIG & CONSTANTS
25
  # ═══════════════════════════════════════════════════════════════
26
 
27
+ VERSION = "3.0.0-hf-lb"
28
  APP_NAME = "Multi-Model-AI-API"
29
  DEFAULT_SYSTEM_PROMPT = "You are a helpful, friendly AI assistant."
30
  DEFAULT_MODEL = "gpt-oss-120b"
 
39
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:128.0) Gecko/20100101 Firefox/128.0",
40
  ]
41
 
42
+ # ═══════════════════════════════════════════════════════════════
43
+ # MULTIMODAL HELPERS
44
+ # ═══════════════════════════════════════════════════════════════
45
+
46
+ def extract_text_and_images(content: Any) -> Tuple[str, List[str]]:
47
+ """
48
+ Parse OpenAI-style multimodal content.
49
+ Returns (text, [base64_or_url, ...])
50
+ Handles plain strings and lists of {"type": "text" | "image_url" | "image"} blocks.
51
+ """
52
+ if content is None:
53
+ return "", []
54
+ if isinstance(content, str):
55
+ return content.strip(), []
56
+
57
+ texts: List[str] = []
58
+ images: List[str] = []
59
+
60
+ if isinstance(content, list):
61
+ for block in content:
62
+ if not isinstance(block, dict):
63
+ texts.append(str(block))
64
+ continue
65
+ btype = block.get("type", "")
66
+ if btype == "text":
67
+ t = block.get("text", "")
68
+ if t:
69
+ texts.append(t)
70
+ elif btype == "image_url":
71
+ img = block.get("image_url", {})
72
+ url = img.get("url", "") if isinstance(img, dict) else str(img)
73
+ if url:
74
+ images.append(url)
75
+ elif btype == "image":
76
+ # Anthropic-style image block: {"type": "image", "source": {"media_type": ..., "data": <base64>}}
77
+ src = block.get("source", {})
78
+ if isinstance(src, dict):
79
+ data = src.get("data", "")
80
+ if data:
81
+ media = src.get("media_type", "image/jpeg")
82
+ images.append(f"data:{media};base64,{data}")
83
+
84
+ return " ".join(texts).strip(), images
85
+
86
+
87
+ def decode_image_to_bytes(image_url: str) -> Optional[Tuple[bytes, str]]:
88
+ """Convert image URL or data URI to (bytes, media_type)."""
89
+ try:
90
+ if image_url.startswith("data:"):
91
+ # data:image/jpeg;base64,/9j/...
92
+ header, data = image_url.split(",", 1)
93
+ media_type = header.split(";")[0].split(":")[1]
94
+ return base64.b64decode(data), media_type
95
+ else:
96
+ # Remote URL
97
+ r = requests.get(image_url, timeout=15)
98
+ r.raise_for_status()
99
+ ct = r.headers.get("content-type", "image/jpeg").split(";")[0]
100
+ return r.content, ct
101
+ except Exception as e:
102
+ log.warning(f"Failed to decode image: {e}")
103
+ return None
104
+
105
+
106
+ def save_image_temp(image_url: str) -> Optional[str]:
107
+ """Save image to a temp file and return path (for gradio_client)."""
108
+ import tempfile
109
+ result = decode_image_to_bytes(image_url)
110
+ if not result:
111
+ return None
112
+ data, media_type = result
113
+ ext = media_type.split("/")[-1].replace("jpeg", "jpg")
114
+ with tempfile.NamedTemporaryFile(suffix=f".{ext}", delete=False) as f:
115
+ f.write(data)
116
+ return f.name
117
+
118
+
119
  # ═══════════════════════════════════════════════════════════════
120
  # MODEL REGISTRY
121
  # ═══════════════════════════════════════════════════════════════
 
143
  clean_analysis: bool = False
144
  lb_pool_size: int = 2
145
  lb_enabled: bool = True
146
+ is_beta: bool = False
147
+
148
 
149
  MODEL_REGISTRY: Dict[str, ModelDef] = {}
150
 
 
180
  supports_temperature=False, supports_streaming=False, supports_history=False,
181
  supports_thinking=False, max_tokens_default=700,
182
  extra_params={"max_new_tokens": 700},
183
+ lb_pool_size=1, lb_enabled=False,
184
  ))
 
185
  register_model(ModelDef(
186
  model_id="command-a-reasoning", display_name="Cohere Command-A Reasoning",
187
  provider_type="gradio_client", space_id="CohereLabs/command-a-reasoning",
 
231
  supports_thinking=False, max_tokens_default=4096,
232
  lb_pool_size=2, lb_enabled=True,
233
  ))
 
234
  register_model(ModelDef(
235
  model_id="qwen2.5-coder", display_name="Qwen2.5-Coder Artifacts (BETA)",
236
  provider_type="gradio_client", space_id="Qwen/Qwen2.5-Coder-Artifacts",
 
271
  rate_limit_burst: int = 15
272
  pool_size: int = 2
273
  max_history_messages: int = 50
274
+ max_message_length: int = 32000
275
  default_temperature: float = 0.7
276
  include_thinking: bool = True
277
  log_sse_raw: bool = False
 
453
 
454
  @classmethod
455
  def extract_qwen_coder_text(cls, result: Any) -> str:
 
 
456
  if result is None:
457
  return ""
458
  if isinstance(result, str):
459
  return result.strip()
460
  if isinstance(result, tuple):
 
 
461
  if len(result) >= 1 and isinstance(result[0], str):
462
  text = result[0].strip()
463
  if text:
464
  return text
 
465
  if len(result) >= 2 and isinstance(result[1], str):
466
  return result[1].strip()
467
  if isinstance(result, (list, dict)):
 
640
  metrics = Metrics()
641
 
642
  # ═══════════════════════════════════════════════════════════════
643
+ # RATE LIMITER — token bucket (10 req/s)
644
  # ═══════════════════════════════════════════════════════════════
645
 
646
  class RateLimiter:
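The RateLimiter body itself is unchanged and collapsed in this hunk. For orientation only, a minimal token-bucket sketch consistent with the header and the rate_limit_rps / rate_limit_burst config fields above (an illustrative sketch, not the actual class body):

import threading, time

class _TokenBucketSketch:
    # Illustrative only: refill `rate` tokens per second up to a `burst` ceiling.
    def __init__(self, rate: float = 10.0, burst: int = 15):
        self.rate, self.burst = rate, burst
        self.tokens = float(burst)
        self.last = time.monotonic()
        self._lock = threading.Lock()

    def allow(self) -> bool:
        with self._lock:
            now = time.monotonic()
            # Refill in proportion to elapsed time, never exceeding the burst capacity.
            self.tokens = min(self.burst, self.tokens + (now - self.last) * self.rate)
            self.last = now
            if self.tokens >= 1.0:
                self.tokens -= 1.0
                return True
            return False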
 
785
 
786
  @abstractmethod
787
  def generate(self, message: str, history=None, system_prompt=None,
788
+ temperature=None, max_tokens=None, images=None, **kwargs) -> str: ...
789
 
790
  def generate_stream(self, message: str, **kwargs) -> Generator[str, None, None]:
791
  yield self.generate(message, **kwargs)
 
867
  return False
868
 
869
  def generate(self, message, history=None, system_prompt=None,
870
+ temperature=None, max_tokens=None, images=None, **kw):
871
  if not self.ready:
872
  self.initialize()
873
  sys_p = system_prompt or self.config.default_system_prompt
 
920
  if self.model_def.clean_analysis else full)
921
 
922
  def generate_stream(self, message, history=None, system_prompt=None,
923
+ temperature=None, max_tokens=None, images=None, **kw):
924
  if not self.ready:
925
  self.initialize()
926
  sys_p = system_prompt or self.config.default_system_prompt
 
1005
  return False
1006
 
1007
  def generate(self, message, history=None, system_prompt=None,
1008
+ temperature=None, max_tokens=None, images=None, **kw):
1009
  if not self.ready:
1010
  self.initialize()
1011
  if not self._client:
1012
  raise APIError(f"{self.model_def.model_id} not initialized")
1013
 
1014
  mid = self.model_def.model_id
1015
+ images = images or []
1016
+
1017
  try:
1018
  if mid == "command-a-vision":
1019
  max_new = (max_tokens
1020
  or self.model_def.extra_params.get("max_new_tokens", 700))
1021
+ # Build multimodal message
1022
+ msg_payload: Any
1023
+ if images:
1024
+ img_path = save_image_temp(images[0])
1025
+ if img_path:
1026
+ msg_payload = {"text": message, "files": [handle_file(img_path)]}
1027
+ else:
1028
+ msg_payload = {"text": message, "files": []}
1029
+ else:
1030
+ msg_payload = {"text": message, "files": []}
1031
  result = self._client.predict(
1032
+ message=msg_payload,
1033
  max_new_tokens=max_new,
1034
  api_name=self.model_def.api_name,
1035
  )
 
1044
  )
1045
 
1046
  elif mid == "command-a-reasoning":
 
1047
  thinking_budget = kw.get(
1048
  "thinking_budget",
1049
  self.model_def.extra_params.get("thinking_budget", 500),
 
1062
  or self.model_def.extra_params.get("max_tokens", 12800))
1063
  top_p = kw.get("top_p",
1064
  self.model_def.extra_params.get("top_p", 0.9))
1065
+ # Vision support
1066
+ if images:
1067
+ img_path = save_image_temp(images[0])
1068
+ files = [handle_file(img_path)] if img_path else []
1069
+ else:
1070
+ files = []
1071
  result = self._client.predict(
1072
+ message={"text": message, "files": files},
1073
  max_tokens=max_tok, temperature=temp, top_p=top_p,
1074
  api_name=self.model_def.api_name,
1075
  )
 
1108
  return ResponseCleaner.extract_chatgpt_text(result)
1109
 
1110
  elif mid == "qwen3-vl":
1111
+ # Vision support
1112
+ if images:
1113
+ img_path = save_image_temp(images[0])
1114
+ files = [handle_file(img_path)] if img_path else []
1115
+ result = self._client.predict(
1116
+ input_value={"files": files, "text": message},
1117
+ api_name="/add_message",
1118
+ )
1119
+ else:
1120
+ result = self._client.predict(
1121
+ input_value={"files": None, "text": message},
1122
+ api_name="/add_message",
1123
+ )
1124
  return ResponseCleaner.extract_qwen_text(result)
1125
 
1126
  elif mid == "qwen2.5-coder":
 
1127
  sys_override = self.model_def.extra_params.get(
1128
  "system_prompt_override", ""
1129
  )
 
1159
  raise APIError(f"{mid} error: {e}", "PROVIDER_ERROR")
1160
 
1161
  def _extract_reasoning(self, result: Any) -> str:
 
 
1162
  if result is None:
1163
  return ""
1164
  if isinstance(result, str):
1165
  return result.strip()
1166
  if isinstance(result, dict):
 
1167
  for key in ("response", "output", "answer", "text", "content", "result"):
1168
  if key in result:
1169
  val = result[key]
1170
  if isinstance(val, str):
1171
  return val.strip()
1172
  return str(val)
 
1173
  thinking = result.get("thinking", "")
1174
  response = result.get("response", result.get("output", ""))
1175
  if thinking and response:
1176
  return f"<thinking>\n{thinking}\n</thinking>\n{response}"
1177
  if response:
1178
  return str(response).strip()
 
1179
  return json.dumps(result, ensure_ascii=False, indent=2)
1180
  if isinstance(result, (list, tuple)):
1181
  if len(result) == 1:
1182
  return str(result[0]).strip()
 
1183
  texts = []
1184
  for item in result:
1185
  if isinstance(item, str) and item.strip():
 
1211
  return ResponseCleaner.clean_glm(str(result), include_thinking)
1212
 
1213
 
 
1214
  def create_provider(model_id: str, config: Config,
1215
  instance_id: int = 0) -> ModelProvider:
1216
  if model_id not in MODEL_REGISTRY:
 
1343
  inst.record_failure()
1344
  log.warning(
1345
  f"[LB] Failover instance {inst.instance_id} "
1346
+ f"for '{self.model_id}' failed: {e}"
1347
  )
1348
 
1349
  raise APIError(
 
1426
  }
1427
 
1428
  # ═══════════════════════════════════════════════════════════════
1429
+ # MULTI-MODEL CLIENT
1430
  # ═══════════════════════════════════════════════════════════════
1431
 
1432
  class MultiModelClient:
 
1505
 
1506
  def send_message(
1507
  self,
1508
+ message: Any, # str OR list (multimodal)
1509
  *,
1510
  stream: bool = False,
1511
  model: Optional[str] = None,
 
1514
  temperature: Optional[float] = None,
1515
  max_tokens: Optional[int] = None,
1516
  include_thinking: Optional[bool] = None,
1517
+ images: Optional[List[str]] = None,
1518
  **kwargs,
1519
  ) -> Union[str, Generator]:
1520
  model_id = model or self._current_model
1521
  if model_id not in MODEL_REGISTRY:
1522
  raise ModelNotFoundError(model_id)
1523
  mdef = MODEL_REGISTRY[model_id]
1524
+
1525
+ # ── Normalise multimodal content ──────────────────────
1526
+ if isinstance(message, list):
1527
+ text, extracted_images = extract_text_and_images(message)
1528
+ if not images:
1529
+ images = extracted_images
1530
+ message = text
1531
+
1532
+ if isinstance(message, str):
1533
+ message = message.strip()
1534
+ else:
1535
+ message = str(message).strip()
1536
+
1537
+ if not message and not images:
1538
  raise APIError("Empty message", "INVALID_INPUT", 400)
1539
  if len(message) > self.config.max_message_length:
1540
  raise APIError("Message too long", "INVALID_INPUT", 400)
 
1582
  system_prompt=eff_sys,
1583
  temperature=eff_temp,
1584
  max_tokens=max_tokens,
1585
+ images=images,
1586
  **extra,
1587
  )
1588
  return self._wrap_stream(gen, conv, start, model_id)
 
1594
  system_prompt=eff_sys,
1595
  temperature=eff_temp,
1596
  max_tokens=max_tokens,
1597
+ images=images,
1598
  **extra,
1599
  )
1600
  dur = (time.monotonic() - start) * 1000
 
1686
  return c
1687
 
1688
  # ═══════════════════════════════════════════════════════════════
1689
+ # ALIAS RESOLVER
1690
  # ═══════════════════════════════════════════════════════════════
1691
 
1692
  ALIASES = {
 
1707
 
1708
 
1709
  def resolve_alias(model_id: str) -> str:
1710
+ if not model_id:
1711
+ return config.default_model
1712
  return ALIASES.get(model_id.lower(), model_id)
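With the new guard, resolution no longer attempts a lookup on an empty or missing model id; the behavior follows directly from the code above:

# Illustrative behavior (the ALIASES table itself is collapsed in this diff):
resolve_alias("")            # -> config.default_model (new guard)
resolve_alias("Some-Model")  # -> ALIASES["some-model"] if that alias exists, else "Some-Model" unchanged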
1713
 
1714
  # ═══════════════════════════════════════════════════════════════
 
1741
  "name": APP_NAME,
1742
  "version": VERSION,
1743
  "default_model": config.default_model,
1744
+ "features": ["load_balancing", "10_req_per_second_limit", "failover", "vision"],
1745
  "models": list(MODEL_REGISTRY.keys()),
1746
  "beta_models": [mid for mid, mdef in MODEL_REGISTRY.items() if mdef.is_beta],
1747
+ "vision_models": [mid for mid, mdef in MODEL_REGISTRY.items() if mdef.supports_vision],
1748
  "endpoints": {
1749
  "POST /chat": "Chat with any model",
1750
  "POST /chat/stream": "Streaming chat",
1751
+ "POST /v1/chat/completions": "OpenAI-compatible (supports vision)",
1752
  "GET /v1/models": "List models",
1753
  "POST /models/init": "Init a model",
1754
+ "GET /health": "Health check",
1755
  "GET /metrics": "Metrics",
1756
+ "GET /lb/status": "Load balancer status",
1757
  },
1758
  })
1759
 
 
1761
  @app.route("/chat", methods=["POST"])
1762
  def chat():
1763
  data = freq.get_json(force=True, silent=True) or {}
1764
+ raw_message = data.get("message", "")
1765
+ images = data.get("images", [])
1766
+
1767
+ # Support multimodal content directly in message field
1768
+ if isinstance(raw_message, list):
1769
+ text, extracted = extract_text_and_images(raw_message)
1770
+ images = images or extracted
1771
+ message = text
1772
+ else:
1773
+ message = str(raw_message).strip()
1774
+
1775
+ if not message and not images:
1776
  return jsonify({"ok": False, "error": "'message' required"}), 400
1777
+
1778
  model_id = resolve_alias(data.get("model", config.default_model))
1779
  include_thinking = data.get("include_thinking", config.include_thinking)
1780
  client = pool.acquire()
1781
  if data.get("new_conversation"):
1782
  client.new_conversation(data.get("system_prompt"), model_id)
1783
 
 
1784
  extra = {}
1785
  if model_id == "command-a-reasoning" and "thinking_budget" in data:
1786
  extra["thinking_budget"] = data["thinking_budget"]
 
1791
  temperature=data.get("temperature"),
1792
  max_tokens=data.get("max_tokens"),
1793
  include_thinking=include_thinking,
1794
+ images=images or None,
1795
  **extra,
1796
  )
1797
  thinking, clean = ThinkingParser.split(result)
 
1813
  @app.route("/chat/stream", methods=["POST"])
1814
  def chat_stream():
1815
  data = freq.get_json(force=True, silent=True) or {}
1816
+ raw_message = data.get("message", "")
1817
+ images = data.get("images", [])
1818
+
1819
+ if isinstance(raw_message, list):
1820
+ text, extracted = extract_text_and_images(raw_message)
1821
+ images = images or extracted
1822
+ message = text
1823
+ else:
1824
+ message = str(raw_message).strip()
1825
+
1826
+ if not message and not images:
1827
  return jsonify({"ok": False, "error": "'message' required"}), 400
1828
+
1829
  model_id = resolve_alias(data.get("model", config.default_model))
1830
  include_thinking = data.get("include_thinking", config.include_thinking)
1831
  client = pool.acquire()
 
1847
  temperature=data.get("temperature"),
1848
  max_tokens=data.get("max_tokens"),
1849
  include_thinking=include_thinking,
1850
+ images=images or None,
1851
  **extra,
1852
  ):
1853
  yield f"data: {json.dumps({'chunk': chunk})}\n\n"
 
1858
  temperature=data.get("temperature"),
1859
  max_tokens=data.get("max_tokens"),
1860
  include_thinking=include_thinking,
1861
+ images=images or None,
1862
  **extra,
1863
  )
1864
  yield f"data: {json.dumps({'chunk': result})}\n\n"
 
1903
  def openai_compat():
1904
  if freq.method == "OPTIONS":
1905
  return "", 200
1906
+
1907
  data = freq.get_json(force=True, silent=True) or {}
1908
  messages = data.get("messages", [])
1909
  do_stream = data.get("stream", False)
 
1915
  if model_id not in MODEL_REGISTRY:
1916
  return jsonify({
1917
  "error": {
1918
+ "message": f"Model '{model_id}' not found. Available: {list(MODEL_REGISTRY.keys())}",
1919
  "type": "invalid_request_error",
1920
+ "available_models": list(MODEL_REGISTRY.keys()),
1921
  }
1922
  }), 404
1923
+
1924
  if not messages:
1925
  return jsonify({"error": {"message": "messages required"}}), 400
1926
 
1927
+ # ── Extract user message, system prompt, and images ───────
1928
+ user_msg: str = ""
1929
+ system_prompt: Optional[str] = None
1930
+ images: List[str] = []
1931
+
1932
  for msg in messages:
1933
+ role = msg.get("role", "")
1934
+ content = msg.get("content", "")
1935
+
1936
+ if role == "system":
1937
+ system_prompt = content if isinstance(content, str) else str(content)
1938
+
1939
+ if role == "user":
1940
+ if isinstance(content, list):
1941
+ text, imgs = extract_text_and_images(content)
1942
+ user_msg = text
1943
+ images.extend(imgs)
1944
+ elif isinstance(content, str):
1945
+ user_msg = content
1946
+ else:
1947
+ user_msg = str(content)
1948
+
1949
+ if not user_msg and not images:
1950
  return jsonify({"error": {"message": "No user message"}}), 400
1951
 
1952
  rid = f"chatcmpl-{uuid.uuid4().hex[:29]}"
 
1954
  client = pool.acquire()
1955
  client.new_conversation(system_prompt, model_id)
1956
 
1957
+ # Replay prior turns into the conversation (everything except the final message)
1958
  for msg in messages[:-1]:
1959
  role = msg.get("role")
1960
  content = msg.get("content", "")
1961
  if role in ("user", "assistant") and content:
1962
+ text = (
1963
+ extract_text_and_images(content)[0]
1964
+ if isinstance(content, list)
1965
+ else str(content)
1966
+ )
1967
+ if text:
1968
+ client.active_conversation.add_message(role, text)
1969
 
1970
  mdef = MODEL_REGISTRY[model_id]
1971
 
 
1972
  extra = {}
1973
  if model_id == "command-a-reasoning" and "thinking_budget" in data:
1974
  extra["thinking_budget"] = data["thinking_budget"]
 
1981
  for chunk in client.send_message(
1982
  user_msg, stream=True, model=model_id,
1983
  temperature=temperature, max_tokens=max_tokens,
1984
+ include_thinking=include_thinking,
1985
+ images=images or None, **extra,
1986
  ):
1987
  yield f"data: {json.dumps({'id': rid, 'object': 'chat.completion.chunk', 'created': created, 'model': model_id, 'choices': [{'index': 0, 'delta': {'content': chunk}, 'finish_reason': None}]})}\n\n"
1988
  else:
1989
  result = client.send_message(
1990
  user_msg, model=model_id, temperature=temperature,
1991
+ max_tokens=max_tokens, include_thinking=include_thinking,
1992
+ images=images or None, **extra,
1993
  )
1994
  yield f"data: {json.dumps({'id': rid, 'object': 'chat.completion.chunk', 'created': created, 'model': model_id, 'choices': [{'index': 0, 'delta': {'content': result}, 'finish_reason': None}]})}\n\n"
1995
  yield f"data: {json.dumps({'id': rid, 'object': 'chat.completion.chunk', 'created': created, 'model': model_id, 'choices': [{'index': 0, 'delta': {}, 'finish_reason': 'stop'}]})}\n\n"
 
2002
 
2003
  result = client.send_message(
2004
  user_msg, model=model_id, temperature=temperature,
2005
+ max_tokens=max_tokens, include_thinking=include_thinking,
2006
+ images=images or None, **extra,
2007
  )
2008
  return jsonify({
2009
  "id": rid,
 
2101
 
2102
  if __name__ == "__main__":
2103
  port = int(os.environ.get("PORT", 7860))
2104
+ log.info(f"Starting {APP_NAME} v{VERSION} on port {port}")
2105
  log.info(f"Models: {list(MODEL_REGISTRY.keys())}")
2106
  log.info(f"Rate limit: {config.rate_limit_rps} req/s (burst: {config.rate_limit_burst})")
2107
  for mid, mdef in MODEL_REGISTRY.items():
 
2110
  if mdef.lb_enabled
2111
  else "LB OFF (single instance)"
2112
  )
2113
+ vision_str = " [VISION]" if mdef.supports_vision else ""
2114
  beta_str = " [BETA]" if mdef.is_beta else ""
2115
+ log.info(f" {mid}: {lb_str}{vision_str}{beta_str}")
2116
  app.run(host="0.0.0.0", port=port, threaded=True)
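Once the server is up, the new vision path can be exercised through either endpoint. A hedged example, assuming a local run on the default port 7860 and an illustrative image URL:

import requests

BASE = "http://localhost:7860"  # assumption: local run on the default port from __main__

# OpenAI-compatible request with an image_url block (parsed by extract_text_and_images)
resp = requests.post(f"{BASE}/v1/chat/completions", json={
    "model": "command-a-vision",
    "messages": [{
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this image."},
            {"type": "image_url", "image_url": {"url": "https://example.com/photo.jpg"}},  # illustrative URL
        ],
    }],
})
print(resp.json())

# Native /chat endpoint with the separate "images" field
resp = requests.post(f"{BASE}/chat", json={
    "model": "command-a-vision",
    "message": "Describe this image.",
    "images": ["https://example.com/photo.jpg"],  # URL or data: URI
})
print(resp.json())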