Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -23,7 +23,7 @@ except ImportError:
|
|
| 23 |
# CONFIG & CONSTANTS
|
| 24 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 25 |
|
| 26 |
-
VERSION = "2.
|
| 27 |
APP_NAME = "Multi-Model-AI-API"
|
| 28 |
DEFAULT_SYSTEM_PROMPT = "You are a helpful, friendly AI assistant."
|
| 29 |
DEFAULT_MODEL = "gpt-oss-120b"
|
|
@@ -63,9 +63,9 @@ class ModelDef:
|
|
| 63 |
api_name: Optional[str] = None
|
| 64 |
extra_params: Dict[str, Any] = field(default_factory=dict)
|
| 65 |
clean_analysis: bool = False
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
|
| 70 |
MODEL_REGISTRY: Dict[str, ModelDef] = {}
|
| 71 |
|
|
@@ -103,6 +103,17 @@ def _init_registry():
|
|
| 103 |
extra_params={"max_new_tokens": 700},
|
| 104 |
lb_pool_size=1, lb_enabled=False, # NO load balancing for translate
|
| 105 |
))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 106 |
register_model(ModelDef(
|
| 107 |
model_id="minimax-vl-01", display_name="MiniMax VL-01",
|
| 108 |
provider_type="gradio_client", space_id="MiniMaxAI/MiniMax-VL-01",
|
|
@@ -142,6 +153,27 @@ def _init_registry():
|
|
| 142 |
supports_thinking=False, max_tokens_default=4096,
|
| 143 |
lb_pool_size=2, lb_enabled=True,
|
| 144 |
))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 145 |
|
| 146 |
|
| 147 |
_init_registry()
|
|
@@ -158,8 +190,8 @@ class Config:
|
|
| 158 |
max_retries: int = 3
|
| 159 |
retry_backoff_base: float = 1.5
|
| 160 |
retry_jitter: float = 0.5
|
| 161 |
-
rate_limit_rps: int = 10
|
| 162 |
-
rate_limit_burst: int = 15
|
| 163 |
pool_size: int = 2
|
| 164 |
max_history_messages: int = 50
|
| 165 |
max_message_length: int = 10000
|
|
@@ -342,6 +374,28 @@ class ResponseCleaner:
|
|
| 342 |
return str(chatbot).strip() if chatbot else ""
|
| 343 |
return str(result)
|
| 344 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 345 |
@classmethod
|
| 346 |
def clean(cls, text: str, model_id: str = "",
|
| 347 |
include_thinking: bool = True) -> str:
|
|
@@ -459,7 +513,6 @@ class Metrics:
|
|
| 459 |
requests_per_model: Dict[str, int] = field(default_factory=dict)
|
| 460 |
_latencies: deque = field(default_factory=lambda: deque(maxlen=1000), repr=False)
|
| 461 |
started_at: float = field(default_factory=time.time)
|
| 462 |
-
# Load balancer metrics
|
| 463 |
lb_total_dispatches: int = 0
|
| 464 |
lb_failovers: int = 0
|
| 465 |
|
|
@@ -519,10 +572,8 @@ metrics = Metrics()
|
|
| 519 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 520 |
|
| 521 |
class RateLimiter:
|
| 522 |
-
"""Token-bucket rate limiter. Default: 10 requests/second with burst."""
|
| 523 |
-
|
| 524 |
def __init__(self, rps: int = 10, burst: int = 15):
|
| 525 |
-
self.rate = float(rps)
|
| 526 |
self.max_tokens = float(burst)
|
| 527 |
self.tokens = float(burst)
|
| 528 |
self.last_refill = time.monotonic()
|
|
@@ -544,7 +595,7 @@ class RateLimiter:
|
|
| 544 |
return True
|
| 545 |
if time.monotonic() >= deadline:
|
| 546 |
return False
|
| 547 |
-
time.sleep(0.05)
|
| 548 |
|
| 549 |
def get_info(self) -> Dict:
|
| 550 |
with self._lock:
|
|
@@ -650,7 +701,6 @@ class ModelProvider(ABC):
|
|
| 650 |
self.instance_id = instance_id
|
| 651 |
self.ready = False
|
| 652 |
self._lock = threading.Lock()
|
| 653 |
-
# Per-instance health tracking
|
| 654 |
self._consecutive_failures = 0
|
| 655 |
self._last_success_time = 0.0
|
| 656 |
self._last_failure_time = 0.0
|
|
@@ -686,20 +736,16 @@ class ModelProvider(ABC):
|
|
| 686 |
|
| 687 |
@property
|
| 688 |
def health_score(self) -> float:
|
| 689 |
-
"""0.0 (worst) to 1.0 (best). Used by load balancer to pick instance."""
|
| 690 |
if not self.ready:
|
| 691 |
return 0.0
|
| 692 |
score = 1.0
|
| 693 |
-
# Penalise consecutive failures
|
| 694 |
score -= min(self._consecutive_failures * 0.2, 0.8)
|
| 695 |
-
# Penalise high avg latency (>10s = bad)
|
| 696 |
if self._latencies:
|
| 697 |
avg = self.avg_latency
|
| 698 |
if avg > 10000:
|
| 699 |
score -= 0.3
|
| 700 |
elif avg > 5000:
|
| 701 |
score -= 0.15
|
| 702 |
-
# Penalise high failure rate
|
| 703 |
if self._total_requests > 5:
|
| 704 |
fail_rate = self._total_failures / self._total_requests
|
| 705 |
score -= fail_rate * 0.4
|
|
@@ -903,6 +949,7 @@ class GradioClientProvider(ModelProvider):
|
|
| 903 |
max_new_tokens=max_new,
|
| 904 |
api_name=self.model_def.api_name,
|
| 905 |
)
|
|
|
|
| 906 |
elif mid == "command-a-translate":
|
| 907 |
max_new = (max_tokens
|
| 908 |
or self.model_def.extra_params.get("max_new_tokens", 700))
|
|
@@ -911,6 +958,20 @@ class GradioClientProvider(ModelProvider):
|
|
| 911 |
max_new_tokens=max_new,
|
| 912 |
api_name=self.model_def.api_name,
|
| 913 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 914 |
elif mid == "minimax-vl-01":
|
| 915 |
temp = (temperature if temperature is not None
|
| 916 |
else self.model_def.default_temperature)
|
|
@@ -923,6 +984,7 @@ class GradioClientProvider(ModelProvider):
|
|
| 923 |
max_tokens=max_tok, temperature=temp, top_p=top_p,
|
| 924 |
api_name=self.model_def.api_name,
|
| 925 |
)
|
|
|
|
| 926 |
elif mid == "glm-4.5":
|
| 927 |
sys_p = system_prompt or self.config.default_system_prompt
|
| 928 |
temp = (temperature if temperature is not None
|
|
@@ -937,6 +999,7 @@ class GradioClientProvider(ModelProvider):
|
|
| 937 |
api_name=self.model_def.api_name,
|
| 938 |
)
|
| 939 |
return self._extract_glm(result, include)
|
|
|
|
| 940 |
elif mid == "chatgpt":
|
| 941 |
temp = (temperature if temperature is not None
|
| 942 |
else self.model_def.default_temperature)
|
|
@@ -954,12 +1017,34 @@ class GradioClientProvider(ModelProvider):
|
|
| 954 |
)
|
| 955 |
self._chat_counter += 1
|
| 956 |
return ResponseCleaner.extract_chatgpt_text(result)
|
|
|
|
| 957 |
elif mid == "qwen3-vl":
|
| 958 |
result = self._client.predict(
|
| 959 |
input_value={"files": None, "text": message},
|
| 960 |
api_name="/add_message",
|
| 961 |
)
|
| 962 |
return ResponseCleaner.extract_qwen_text(result)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 963 |
else:
|
| 964 |
raise APIError(f"Unknown model handler: {mid}")
|
| 965 |
|
|
@@ -976,6 +1061,45 @@ class GradioClientProvider(ModelProvider):
|
|
| 976 |
except Exception as e:
|
| 977 |
raise APIError(f"{mid} error: {e}", "PROVIDER_ERROR")
|
| 978 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 979 |
def _extract_glm(self, result, include_thinking: bool = True) -> str:
|
| 980 |
if isinstance(result, tuple) and len(result) >= 1:
|
| 981 |
chatbot = result[0]
|
|
@@ -996,7 +1120,7 @@ class GradioClientProvider(ModelProvider):
|
|
| 996 |
return ResponseCleaner.clean_glm(str(result), include_thinking)
|
| 997 |
|
| 998 |
|
| 999 |
-
# Factory
|
| 1000 |
def create_provider(model_id: str, config: Config,
|
| 1001 |
instance_id: int = 0) -> ModelProvider:
|
| 1002 |
if model_id not in MODEL_REGISTRY:
|
|
@@ -1007,18 +1131,10 @@ def create_provider(model_id: str, config: Config,
|
|
| 1007 |
return GradioClientProvider(mdef, config, instance_id)
|
| 1008 |
|
| 1009 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1010 |
-
# LOAD BALANCER
|
| 1011 |
-
# round-robin + failover
|
| 1012 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1013 |
|
| 1014 |
class LoadBalancedProviderPool:
|
| 1015 |
-
"""
|
| 1016 |
-
Manages multiple provider instances for a single model.
|
| 1017 |
-
Selects the best instance based on health score with
|
| 1018 |
-
weighted-random selection (healthier instances chosen more).
|
| 1019 |
-
Falls back through all instances on failure.
|
| 1020 |
-
"""
|
| 1021 |
-
|
| 1022 |
def __init__(self, model_id: str, config: Config):
|
| 1023 |
self.model_id = model_id
|
| 1024 |
self.config = config
|
|
@@ -1041,7 +1157,6 @@ class LoadBalancedProviderPool:
|
|
| 1041 |
return len(self._instances)
|
| 1042 |
|
| 1043 |
def initialize_all(self) -> int:
|
| 1044 |
-
"""Initialize all instances, return count of successful ones."""
|
| 1045 |
ok = 0
|
| 1046 |
for inst in self._instances:
|
| 1047 |
try:
|
|
@@ -1055,7 +1170,6 @@ class LoadBalancedProviderPool:
|
|
| 1055 |
return ok
|
| 1056 |
|
| 1057 |
def initialize_one(self) -> bool:
|
| 1058 |
-
"""Initialize at least one instance."""
|
| 1059 |
for inst in self._instances:
|
| 1060 |
try:
|
| 1061 |
if inst.initialize():
|
|
@@ -1065,30 +1179,21 @@ class LoadBalancedProviderPool:
|
|
| 1065 |
return False
|
| 1066 |
|
| 1067 |
def _select_instance(self) -> ModelProvider:
|
| 1068 |
-
"""
|
| 1069 |
-
Select best available instance.
|
| 1070 |
-
Strategy: weighted random by health score.
|
| 1071 |
-
If all have equal scores, falls back to round-robin.
|
| 1072 |
-
"""
|
| 1073 |
if len(self._instances) == 1:
|
| 1074 |
return self._instances[0]
|
| 1075 |
|
| 1076 |
with self._lock:
|
| 1077 |
-
# Collect health scores
|
| 1078 |
scored = []
|
| 1079 |
for inst in self._instances:
|
| 1080 |
score = inst.health_score
|
| 1081 |
-
# Give a minimum weight so unhealthy instances can still recover
|
| 1082 |
scored.append((inst, max(score, 0.05)))
|
| 1083 |
|
| 1084 |
total_weight = sum(s for _, s in scored)
|
| 1085 |
if total_weight <= 0:
|
| 1086 |
-
# All dead, just round-robin
|
| 1087 |
inst = self._instances[self._rr_index % len(self._instances)]
|
| 1088 |
self._rr_index += 1
|
| 1089 |
return inst
|
| 1090 |
|
| 1091 |
-
# Weighted random selection
|
| 1092 |
r = random.uniform(0, total_weight)
|
| 1093 |
cumulative = 0.0
|
| 1094 |
for inst, weight in scored:
|
|
@@ -1096,29 +1201,21 @@ class LoadBalancedProviderPool:
|
|
| 1096 |
if r <= cumulative:
|
| 1097 |
return inst
|
| 1098 |
|
| 1099 |
-
# Fallback
|
| 1100 |
return scored[-1][0]
|
| 1101 |
|
| 1102 |
def _get_ordered_instances(self) -> List[ModelProvider]:
|
| 1103 |
-
"""Return instances ordered by health score (best first)."""
|
| 1104 |
return sorted(self._instances, key=lambda p: p.health_score, reverse=True)
|
| 1105 |
|
| 1106 |
def execute(self, fn_name: str, **kwargs) -> Any:
|
| 1107 |
-
"""
|
| 1108 |
-
Execute a provider method with automatic failover.
|
| 1109 |
-
Tries the best instance first, fails over to others.
|
| 1110 |
-
"""
|
| 1111 |
primary = self._select_instance()
|
| 1112 |
metrics.record_lb_dispatch()
|
| 1113 |
|
| 1114 |
-
# Ensure primary is ready
|
| 1115 |
if not primary.ready:
|
| 1116 |
try:
|
| 1117 |
primary.initialize()
|
| 1118 |
except Exception:
|
| 1119 |
pass
|
| 1120 |
|
| 1121 |
-
# Try primary
|
| 1122 |
start = time.monotonic()
|
| 1123 |
try:
|
| 1124 |
result = self._call_provider(primary, fn_name, **kwargs)
|
|
@@ -1132,7 +1229,6 @@ class LoadBalancedProviderPool:
|
|
| 1132 |
f"'{self.model_id}' failed: {primary_err}"
|
| 1133 |
)
|
| 1134 |
|
| 1135 |
-
# Failover through remaining instances
|
| 1136 |
for inst in self._get_ordered_instances():
|
| 1137 |
if inst is primary:
|
| 1138 |
continue
|
|
@@ -1166,11 +1262,6 @@ class LoadBalancedProviderPool:
|
|
| 1166 |
)
|
| 1167 |
|
| 1168 |
def execute_stream(self, **kwargs) -> Generator[str, None, None]:
|
| 1169 |
-
"""
|
| 1170 |
-
Execute streaming with failover.
|
| 1171 |
-
Since generators can't easily be retried mid-stream,
|
| 1172 |
-
we do failover only on initial connection failure.
|
| 1173 |
-
"""
|
| 1174 |
primary = self._select_instance()
|
| 1175 |
metrics.record_lb_dispatch()
|
| 1176 |
|
|
@@ -1180,7 +1271,6 @@ class LoadBalancedProviderPool:
|
|
| 1180 |
except Exception:
|
| 1181 |
pass
|
| 1182 |
|
| 1183 |
-
# Try primary
|
| 1184 |
try:
|
| 1185 |
yield from self._call_provider_stream(primary, **kwargs)
|
| 1186 |
return
|
|
@@ -1191,7 +1281,6 @@ class LoadBalancedProviderPool:
|
|
| 1191 |
f"for '{self.model_id}' failed: {primary_err}"
|
| 1192 |
)
|
| 1193 |
|
| 1194 |
-
# Failover
|
| 1195 |
for inst in self._get_ordered_instances():
|
| 1196 |
if inst is primary:
|
| 1197 |
continue
|
|
@@ -1242,6 +1331,7 @@ class LoadBalancedProviderPool:
|
|
| 1242 |
"model_id": self.model_id,
|
| 1243 |
"lb_enabled": self.mdef.lb_enabled,
|
| 1244 |
"pool_size": len(self._instances),
|
|
|
|
| 1245 |
"instances": [inst.get_instance_info() for inst in self._instances],
|
| 1246 |
}
|
| 1247 |
|
|
@@ -1280,14 +1370,13 @@ class MultiModelClient:
|
|
| 1280 |
return self._lb_pools[model_id]
|
| 1281 |
|
| 1282 |
def _ensure_ready(self, model_id: str) -> LoadBalancedProviderPool:
|
| 1283 |
-
|
| 1284 |
-
|
| 1285 |
-
has_ready = any(inst.ready for inst in pool._instances)
|
| 1286 |
if not has_ready:
|
| 1287 |
-
if not
|
| 1288 |
raise APIError(f"Cannot init any instance for {model_id}",
|
| 1289 |
"INIT_FAILED")
|
| 1290 |
-
return
|
| 1291 |
|
| 1292 |
@property
|
| 1293 |
def active_conversation(self) -> Conversation:
|
|
@@ -1312,16 +1401,15 @@ class MultiModelClient:
|
|
| 1312 |
|
| 1313 |
def init_model(self, model_id: str) -> bool:
|
| 1314 |
try:
|
| 1315 |
-
|
| 1316 |
-
return
|
| 1317 |
except Exception:
|
| 1318 |
return False
|
| 1319 |
|
| 1320 |
def init_model_all(self, model_id: str) -> int:
|
| 1321 |
-
"""Init all instances in the pool, return count of ready ones."""
|
| 1322 |
try:
|
| 1323 |
-
|
| 1324 |
-
return
|
| 1325 |
except Exception:
|
| 1326 |
return 0
|
| 1327 |
|
|
@@ -1450,8 +1538,8 @@ class MultiModelClient:
|
|
| 1450 |
|
| 1451 |
def get_status(self) -> Dict:
|
| 1452 |
lb_info = {}
|
| 1453 |
-
for model_id,
|
| 1454 |
-
lb_info[model_id] =
|
| 1455 |
|
| 1456 |
return {
|
| 1457 |
"version": VERSION,
|
|
@@ -1464,7 +1552,7 @@ class MultiModelClient:
|
|
| 1464 |
}
|
| 1465 |
|
| 1466 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1467 |
-
# SESSION POOL
|
| 1468 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1469 |
|
| 1470 |
class SessionPool:
|
|
@@ -1502,10 +1590,14 @@ ALIASES = {
|
|
| 1502 |
"cohere-vision": "command-a-vision",
|
| 1503 |
"command-translate": "command-a-translate",
|
| 1504 |
"cohere-translate": "command-a-translate", "translate": "command-a-translate",
|
|
|
|
|
|
|
| 1505 |
"minimax": "minimax-vl-01", "minimax-vl": "minimax-vl-01",
|
| 1506 |
"glm": "glm-4.5", "glm4": "glm-4.5", "glm-4": "glm-4.5", "zhipu": "glm-4.5",
|
| 1507 |
"gpt": "chatgpt", "gpt-3.5": "chatgpt", "gpt3": "chatgpt", "openai": "chatgpt",
|
| 1508 |
"qwen": "qwen3-vl", "qwen3": "qwen3-vl", "qwen-vl": "qwen3-vl",
|
|
|
|
|
|
|
| 1509 |
}
|
| 1510 |
|
| 1511 |
|
|
@@ -1544,6 +1636,7 @@ def index():
|
|
| 1544 |
"default_model": config.default_model,
|
| 1545 |
"features": ["load_balancing", "10_req_per_second_limit", "failover"],
|
| 1546 |
"models": list(MODEL_REGISTRY.keys()),
|
|
|
|
| 1547 |
"endpoints": {
|
| 1548 |
"POST /chat": "Chat with any model",
|
| 1549 |
"POST /chat/stream": "Streaming chat",
|
|
@@ -1568,14 +1661,22 @@ def chat():
|
|
| 1568 |
client = pool.acquire()
|
| 1569 |
if data.get("new_conversation"):
|
| 1570 |
client.new_conversation(data.get("system_prompt"), model_id)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1571 |
result = client.send_message(
|
| 1572 |
message, model=model_id,
|
| 1573 |
system_prompt=data.get("system_prompt"),
|
| 1574 |
temperature=data.get("temperature"),
|
| 1575 |
max_tokens=data.get("max_tokens"),
|
| 1576 |
include_thinking=include_thinking,
|
|
|
|
| 1577 |
)
|
| 1578 |
thinking, clean = ThinkingParser.split(result)
|
|
|
|
| 1579 |
resp = {
|
| 1580 |
"ok": True,
|
| 1581 |
"response": clean,
|
|
@@ -1585,6 +1686,8 @@ def chat():
|
|
| 1585 |
}
|
| 1586 |
if thinking:
|
| 1587 |
resp["thinking"] = thinking
|
|
|
|
|
|
|
| 1588 |
return jsonify(resp)
|
| 1589 |
|
| 1590 |
|
|
@@ -1602,6 +1705,10 @@ def chat_stream():
|
|
| 1602 |
mdef = MODEL_REGISTRY.get(model_id)
|
| 1603 |
use_stream = mdef.supports_streaming if mdef else False
|
| 1604 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1605 |
def generate():
|
| 1606 |
try:
|
| 1607 |
if use_stream:
|
|
@@ -1611,6 +1718,7 @@ def chat_stream():
|
|
| 1611 |
temperature=data.get("temperature"),
|
| 1612 |
max_tokens=data.get("max_tokens"),
|
| 1613 |
include_thinking=include_thinking,
|
|
|
|
| 1614 |
):
|
| 1615 |
yield f"data: {json.dumps({'chunk': chunk})}\n\n"
|
| 1616 |
else:
|
|
@@ -1620,6 +1728,7 @@ def chat_stream():
|
|
| 1620 |
temperature=data.get("temperature"),
|
| 1621 |
max_tokens=data.get("max_tokens"),
|
| 1622 |
include_thinking=include_thinking,
|
|
|
|
| 1623 |
)
|
| 1624 |
yield f"data: {json.dumps({'chunk': result})}\n\n"
|
| 1625 |
yield "data: [DONE]\n\n"
|
|
@@ -1634,7 +1743,7 @@ def chat_stream():
|
|
| 1634 |
def list_models():
|
| 1635 |
models = []
|
| 1636 |
for mid, mdef in MODEL_REGISTRY.items():
|
| 1637 |
-
|
| 1638 |
"id": mid,
|
| 1639 |
"object": "model",
|
| 1640 |
"owned_by": mdef.owned_by,
|
|
@@ -1652,7 +1761,10 @@ def list_models():
|
|
| 1652 |
"enabled": mdef.lb_enabled,
|
| 1653 |
"pool_size": mdef.lb_pool_size,
|
| 1654 |
},
|
| 1655 |
-
}
|
|
|
|
|
|
|
|
|
|
| 1656 |
return jsonify({"object": "list", "data": models})
|
| 1657 |
|
| 1658 |
|
|
@@ -1692,7 +1804,6 @@ def openai_compat():
|
|
| 1692 |
client = pool.acquire()
|
| 1693 |
client.new_conversation(system_prompt, model_id)
|
| 1694 |
|
| 1695 |
-
# Add history from messages
|
| 1696 |
for msg in messages[:-1]:
|
| 1697 |
role = msg.get("role")
|
| 1698 |
content = msg.get("content", "")
|
|
@@ -1701,50 +1812,30 @@ def openai_compat():
|
|
| 1701 |
|
| 1702 |
mdef = MODEL_REGISTRY[model_id]
|
| 1703 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1704 |
if do_stream:
|
| 1705 |
def generate():
|
| 1706 |
try:
|
| 1707 |
-
yield (
|
| 1708 |
-
f"data: {json.dumps({'id': rid, 'object': 'chat.completion.chunk', "
|
| 1709 |
-
f"'created': created, 'model': model_id, 'choices': ["
|
| 1710 |
-
f"{{'index': 0, 'delta': {{'role': 'assistant'}}, "
|
| 1711 |
-
f"'finish_reason': None}}]})}\n\n"
|
| 1712 |
-
)
|
| 1713 |
if mdef.supports_streaming:
|
| 1714 |
for chunk in client.send_message(
|
| 1715 |
user_msg, stream=True, model=model_id,
|
| 1716 |
temperature=temperature, max_tokens=max_tokens,
|
| 1717 |
-
include_thinking=include_thinking,
|
| 1718 |
):
|
| 1719 |
-
yield (
|
| 1720 |
-
f"data: {json.dumps({'id': rid, "
|
| 1721 |
-
f"'object': 'chat.completion.chunk', "
|
| 1722 |
-
f"'created': created, 'model': model_id, "
|
| 1723 |
-
f"'choices': [{{'index': 0, "
|
| 1724 |
-
f"'delta': {{'content': chunk}}, "
|
| 1725 |
-
f"'finish_reason': None}}]})}\n\n"
|
| 1726 |
-
)
|
| 1727 |
else:
|
| 1728 |
result = client.send_message(
|
| 1729 |
user_msg, model=model_id, temperature=temperature,
|
| 1730 |
max_tokens=max_tokens,
|
| 1731 |
-
include_thinking=include_thinking,
|
| 1732 |
-
)
|
| 1733 |
-
yield (
|
| 1734 |
-
f"data: {json.dumps({'id': rid, "
|
| 1735 |
-
f"'object': 'chat.completion.chunk', "
|
| 1736 |
-
f"'created': created, 'model': model_id, "
|
| 1737 |
-
f"'choices': [{{'index': 0, "
|
| 1738 |
-
f"'delta': {{'content': result}}, "
|
| 1739 |
-
f"'finish_reason': None}}]})}\n\n"
|
| 1740 |
)
|
| 1741 |
-
|
| 1742 |
-
|
| 1743 |
-
f"'object': 'chat.completion.chunk', "
|
| 1744 |
-
f"'created': created, 'model': model_id, "
|
| 1745 |
-
f"'choices': [{{'index': 0, 'delta': {{}}, "
|
| 1746 |
-
f"'finish_reason': 'stop'}}]})}\n\n"
|
| 1747 |
-
)
|
| 1748 |
yield "data: [DONE]\n\n"
|
| 1749 |
except Exception as e:
|
| 1750 |
yield f"data: {json.dumps({'error': {'message': str(e)}})}\n\n"
|
|
@@ -1754,7 +1845,7 @@ def openai_compat():
|
|
| 1754 |
|
| 1755 |
result = client.send_message(
|
| 1756 |
user_msg, model=model_id, temperature=temperature,
|
| 1757 |
-
max_tokens=max_tokens, include_thinking=include_thinking,
|
| 1758 |
)
|
| 1759 |
return jsonify({
|
| 1760 |
"id": rid,
|
|
@@ -1800,11 +1891,10 @@ def metrics_endpoint():
|
|
| 1800 |
|
| 1801 |
@app.route("/lb/status", methods=["GET"])
|
| 1802 |
def lb_status():
|
| 1803 |
-
"""Detailed load balancer status for all models across all clients."""
|
| 1804 |
all_pools = {}
|
| 1805 |
for client in pool._clients:
|
| 1806 |
for model_id, lb_pool in client._lb_pools.items():
|
| 1807 |
-
key =
|
| 1808 |
if key not in all_pools:
|
| 1809 |
all_pools[key] = []
|
| 1810 |
all_pools[key].append(lb_pool.get_pool_info())
|
|
@@ -1835,13 +1925,16 @@ def init_model_ep():
|
|
| 1835 |
}), 400
|
| 1836 |
count = pool.init_model(model_id)
|
| 1837 |
mdef = MODEL_REGISTRY[model_id]
|
| 1838 |
-
|
| 1839 |
"ok": True,
|
| 1840 |
"model": model_id,
|
| 1841 |
"initialized_instances": count,
|
| 1842 |
"lb_enabled": mdef.lb_enabled,
|
| 1843 |
"pool_size_per_client": mdef.lb_pool_size,
|
| 1844 |
-
}
|
|
|
|
|
|
|
|
|
|
| 1845 |
|
| 1846 |
|
| 1847 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
@@ -1854,10 +1947,11 @@ if __name__ == "__main__":
|
|
| 1854 |
log.info(f"Models: {list(MODEL_REGISTRY.keys())}")
|
| 1855 |
log.info(f"Rate limit: {config.rate_limit_rps} req/s (burst: {config.rate_limit_burst})")
|
| 1856 |
for mid, mdef in MODEL_REGISTRY.items():
|
| 1857 |
-
|
| 1858 |
f"LB ON (pool={mdef.lb_pool_size})"
|
| 1859 |
if mdef.lb_enabled
|
| 1860 |
else "LB OFF (single instance)"
|
| 1861 |
)
|
| 1862 |
-
|
|
|
|
| 1863 |
app.run(host="0.0.0.0", port=port, threaded=True)
|
|
|
|
| 23 |
# CONFIG & CONSTANTS
|
| 24 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 25 |
|
| 26 |
+
VERSION = "2.4.0-hf-lb"
|
| 27 |
APP_NAME = "Multi-Model-AI-API"
|
| 28 |
DEFAULT_SYSTEM_PROMPT = "You are a helpful, friendly AI assistant."
|
| 29 |
DEFAULT_MODEL = "gpt-oss-120b"
|
|
|
|
| 63 |
api_name: Optional[str] = None
|
| 64 |
extra_params: Dict[str, Any] = field(default_factory=dict)
|
| 65 |
clean_analysis: bool = False
|
| 66 |
+
lb_pool_size: int = 2
|
| 67 |
+
lb_enabled: bool = True
|
| 68 |
+
is_beta: bool = False # Beta flag for experimental models
|
| 69 |
|
| 70 |
MODEL_REGISTRY: Dict[str, ModelDef] = {}
|
| 71 |
|
|
|
|
| 103 |
extra_params={"max_new_tokens": 700},
|
| 104 |
lb_pool_size=1, lb_enabled=False, # NO load balancing for translate
|
| 105 |
))
|
| 106 |
+
# ββ NEW: Command-A Reasoning ββ
|
| 107 |
+
register_model(ModelDef(
|
| 108 |
+
model_id="command-a-reasoning", display_name="Cohere Command-A Reasoning",
|
| 109 |
+
provider_type="gradio_client", space_id="CohereLabs/command-a-reasoning",
|
| 110 |
+
owned_by="cohere", description="Cohere reasoning model with thinking budget",
|
| 111 |
+
api_name="/chat", supports_vision=False, supports_system_prompt=False,
|
| 112 |
+
supports_temperature=False, supports_streaming=False, supports_history=False,
|
| 113 |
+
supports_thinking=True, thinking_default=True, max_tokens_default=4096,
|
| 114 |
+
extra_params={"thinking_budget": 500},
|
| 115 |
+
lb_pool_size=2, lb_enabled=True,
|
| 116 |
+
))
|
| 117 |
register_model(ModelDef(
|
| 118 |
model_id="minimax-vl-01", display_name="MiniMax VL-01",
|
| 119 |
provider_type="gradio_client", space_id="MiniMaxAI/MiniMax-VL-01",
|
|
|
|
| 153 |
supports_thinking=False, max_tokens_default=4096,
|
| 154 |
lb_pool_size=2, lb_enabled=True,
|
| 155 |
))
|
| 156 |
+
# ββ NEW: Qwen2.5-Coder (BETA) ββ
|
| 157 |
+
register_model(ModelDef(
|
| 158 |
+
model_id="qwen2.5-coder", display_name="Qwen2.5-Coder Artifacts (BETA)",
|
| 159 |
+
provider_type="gradio_client", space_id="Qwen/Qwen2.5-Coder-Artifacts",
|
| 160 |
+
owned_by="alibaba", description="Alibaba Qwen2.5 Coder β code generation model (BETA)",
|
| 161 |
+
api_name="/generation_code", supports_vision=False, supports_system_prompt=True,
|
| 162 |
+
supports_temperature=False, supports_streaming=False, supports_history=False,
|
| 163 |
+
supports_thinking=False, max_tokens_default=4096,
|
| 164 |
+
extra_params={
|
| 165 |
+
"system_prompt_override": (
|
| 166 |
+
"You are a helpful assistant. You are a skilled programming assistant. "
|
| 167 |
+
"You help users write, debug, and understand code across all languages. "
|
| 168 |
+
"Respond with clear explanations and clean code. "
|
| 169 |
+
"Do NOT generate HTML artifacts or web page previews. "
|
| 170 |
+
"Do NOT wrap everything in a single HTML file. "
|
| 171 |
+
"Just provide the code the user asks for with explanations."
|
| 172 |
+
),
|
| 173 |
+
},
|
| 174 |
+
lb_pool_size=2, lb_enabled=True,
|
| 175 |
+
is_beta=True,
|
| 176 |
+
))
|
| 177 |
|
| 178 |
|
| 179 |
_init_registry()
|
|
|
|
| 190 |
max_retries: int = 3
|
| 191 |
retry_backoff_base: float = 1.5
|
| 192 |
retry_jitter: float = 0.5
|
| 193 |
+
rate_limit_rps: int = 10
|
| 194 |
+
rate_limit_burst: int = 15
|
| 195 |
pool_size: int = 2
|
| 196 |
max_history_messages: int = 50
|
| 197 |
max_message_length: int = 10000
|
|
|
|
| 374 |
return str(chatbot).strip() if chatbot else ""
|
| 375 |
return str(result)
|
| 376 |
|
| 377 |
+
@classmethod
|
| 378 |
+
def extract_qwen_coder_text(cls, result: Any) -> str:
|
| 379 |
+
"""Extract text from Qwen2.5-Coder /generation_code response.
|
| 380 |
+
Returns tuple of (markdown, html). We want the markdown part."""
|
| 381 |
+
if result is None:
|
| 382 |
+
return ""
|
| 383 |
+
if isinstance(result, str):
|
| 384 |
+
return result.strip()
|
| 385 |
+
if isinstance(result, tuple):
|
| 386 |
+
# /generation_code returns (markdown_str, html_str)
|
| 387 |
+
# We want the markdown part (index 0)
|
| 388 |
+
if len(result) >= 1 and isinstance(result[0], str):
|
| 389 |
+
text = result[0].strip()
|
| 390 |
+
if text:
|
| 391 |
+
return text
|
| 392 |
+
# Fallback to second element if first is empty
|
| 393 |
+
if len(result) >= 2 and isinstance(result[1], str):
|
| 394 |
+
return result[1].strip()
|
| 395 |
+
if isinstance(result, (list, dict)):
|
| 396 |
+
return str(result)
|
| 397 |
+
return str(result) if result else ""
|
| 398 |
+
|
| 399 |
@classmethod
|
| 400 |
def clean(cls, text: str, model_id: str = "",
|
| 401 |
include_thinking: bool = True) -> str:
|
|
|
|
| 513 |
requests_per_model: Dict[str, int] = field(default_factory=dict)
|
| 514 |
_latencies: deque = field(default_factory=lambda: deque(maxlen=1000), repr=False)
|
| 515 |
started_at: float = field(default_factory=time.time)
|
|
|
|
| 516 |
lb_total_dispatches: int = 0
|
| 517 |
lb_failovers: int = 0
|
| 518 |
|
|
|
|
| 572 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 573 |
|
| 574 |
class RateLimiter:
|
|
|
|
|
|
|
| 575 |
def __init__(self, rps: int = 10, burst: int = 15):
|
| 576 |
+
self.rate = float(rps)
|
| 577 |
self.max_tokens = float(burst)
|
| 578 |
self.tokens = float(burst)
|
| 579 |
self.last_refill = time.monotonic()
|
|
|
|
| 595 |
return True
|
| 596 |
if time.monotonic() >= deadline:
|
| 597 |
return False
|
| 598 |
+
time.sleep(0.05)
|
| 599 |
|
| 600 |
def get_info(self) -> Dict:
|
| 601 |
with self._lock:
|
|
|
|
| 701 |
self.instance_id = instance_id
|
| 702 |
self.ready = False
|
| 703 |
self._lock = threading.Lock()
|
|
|
|
| 704 |
self._consecutive_failures = 0
|
| 705 |
self._last_success_time = 0.0
|
| 706 |
self._last_failure_time = 0.0
|
|
|
|
| 736 |
|
| 737 |
@property
|
| 738 |
def health_score(self) -> float:
|
|
|
|
| 739 |
if not self.ready:
|
| 740 |
return 0.0
|
| 741 |
score = 1.0
|
|
|
|
| 742 |
score -= min(self._consecutive_failures * 0.2, 0.8)
|
|
|
|
| 743 |
if self._latencies:
|
| 744 |
avg = self.avg_latency
|
| 745 |
if avg > 10000:
|
| 746 |
score -= 0.3
|
| 747 |
elif avg > 5000:
|
| 748 |
score -= 0.15
|
|
|
|
| 749 |
if self._total_requests > 5:
|
| 750 |
fail_rate = self._total_failures / self._total_requests
|
| 751 |
score -= fail_rate * 0.4
|
|
|
|
| 949 |
max_new_tokens=max_new,
|
| 950 |
api_name=self.model_def.api_name,
|
| 951 |
)
|
| 952 |
+
|
| 953 |
elif mid == "command-a-translate":
|
| 954 |
max_new = (max_tokens
|
| 955 |
or self.model_def.extra_params.get("max_new_tokens", 700))
|
|
|
|
| 958 |
max_new_tokens=max_new,
|
| 959 |
api_name=self.model_def.api_name,
|
| 960 |
)
|
| 961 |
+
|
| 962 |
+
elif mid == "command-a-reasoning":
|
| 963 |
+
# Cohere Command-A Reasoning with thinking budget
|
| 964 |
+
thinking_budget = kw.get(
|
| 965 |
+
"thinking_budget",
|
| 966 |
+
self.model_def.extra_params.get("thinking_budget", 500),
|
| 967 |
+
)
|
| 968 |
+
result = self._client.predict(
|
| 969 |
+
message=message,
|
| 970 |
+
thinking_budget=thinking_budget,
|
| 971 |
+
api_name=self.model_def.api_name,
|
| 972 |
+
)
|
| 973 |
+
return self._extract_reasoning(result)
|
| 974 |
+
|
| 975 |
elif mid == "minimax-vl-01":
|
| 976 |
temp = (temperature if temperature is not None
|
| 977 |
else self.model_def.default_temperature)
|
|
|
|
| 984 |
max_tokens=max_tok, temperature=temp, top_p=top_p,
|
| 985 |
api_name=self.model_def.api_name,
|
| 986 |
)
|
| 987 |
+
|
| 988 |
elif mid == "glm-4.5":
|
| 989 |
sys_p = system_prompt or self.config.default_system_prompt
|
| 990 |
temp = (temperature if temperature is not None
|
|
|
|
| 999 |
api_name=self.model_def.api_name,
|
| 1000 |
)
|
| 1001 |
return self._extract_glm(result, include)
|
| 1002 |
+
|
| 1003 |
elif mid == "chatgpt":
|
| 1004 |
temp = (temperature if temperature is not None
|
| 1005 |
else self.model_def.default_temperature)
|
|
|
|
| 1017 |
)
|
| 1018 |
self._chat_counter += 1
|
| 1019 |
return ResponseCleaner.extract_chatgpt_text(result)
|
| 1020 |
+
|
| 1021 |
elif mid == "qwen3-vl":
|
| 1022 |
result = self._client.predict(
|
| 1023 |
input_value={"files": None, "text": message},
|
| 1024 |
api_name="/add_message",
|
| 1025 |
)
|
| 1026 |
return ResponseCleaner.extract_qwen_text(result)
|
| 1027 |
+
|
| 1028 |
+
elif mid == "qwen2.5-coder":
|
| 1029 |
+
# First set the system prompt to override artifacts behavior
|
| 1030 |
+
sys_override = self.model_def.extra_params.get(
|
| 1031 |
+
"system_prompt_override", ""
|
| 1032 |
+
)
|
| 1033 |
+
if sys_override:
|
| 1034 |
+
try:
|
| 1035 |
+
self._client.predict(
|
| 1036 |
+
input=sys_override,
|
| 1037 |
+
api_name="/lambda_1",
|
| 1038 |
+
)
|
| 1039 |
+
except Exception as e:
|
| 1040 |
+
log.warning(f"[qwen2.5-coder] Failed to set system prompt: {e}")
|
| 1041 |
+
|
| 1042 |
+
result = self._client.predict(
|
| 1043 |
+
query=message,
|
| 1044 |
+
api_name="/generation_code",
|
| 1045 |
+
)
|
| 1046 |
+
return ResponseCleaner.extract_qwen_coder_text(result)
|
| 1047 |
+
|
| 1048 |
else:
|
| 1049 |
raise APIError(f"Unknown model handler: {mid}")
|
| 1050 |
|
|
|
|
| 1061 |
except Exception as e:
|
| 1062 |
raise APIError(f"{mid} error: {e}", "PROVIDER_ERROR")
|
| 1063 |
|
| 1064 |
+
def _extract_reasoning(self, result: Any) -> str:
|
| 1065 |
+
"""Extract response from Command-A Reasoning.
|
| 1066 |
+
The API returns str | float | bool | list | dict from the Json component."""
|
| 1067 |
+
if result is None:
|
| 1068 |
+
return ""
|
| 1069 |
+
if isinstance(result, str):
|
| 1070 |
+
return result.strip()
|
| 1071 |
+
if isinstance(result, dict):
|
| 1072 |
+
# Try common response keys
|
| 1073 |
+
for key in ("response", "output", "answer", "text", "content", "result"):
|
| 1074 |
+
if key in result:
|
| 1075 |
+
val = result[key]
|
| 1076 |
+
if isinstance(val, str):
|
| 1077 |
+
return val.strip()
|
| 1078 |
+
return str(val)
|
| 1079 |
+
# Check for thinking + response structure
|
| 1080 |
+
thinking = result.get("thinking", "")
|
| 1081 |
+
response = result.get("response", result.get("output", ""))
|
| 1082 |
+
if thinking and response:
|
| 1083 |
+
return f"<thinking>\n{thinking}\n</thinking>\n{response}"
|
| 1084 |
+
if response:
|
| 1085 |
+
return str(response).strip()
|
| 1086 |
+
# Fallback: serialize entire dict
|
| 1087 |
+
return json.dumps(result, ensure_ascii=False, indent=2)
|
| 1088 |
+
if isinstance(result, (list, tuple)):
|
| 1089 |
+
if len(result) == 1:
|
| 1090 |
+
return str(result[0]).strip()
|
| 1091 |
+
# Try to find text in list elements
|
| 1092 |
+
texts = []
|
| 1093 |
+
for item in result:
|
| 1094 |
+
if isinstance(item, str) and item.strip():
|
| 1095 |
+
texts.append(item.strip())
|
| 1096 |
+
if texts:
|
| 1097 |
+
return "\n".join(texts)
|
| 1098 |
+
return json.dumps(result, ensure_ascii=False)
|
| 1099 |
+
if isinstance(result, (int, float, bool)):
|
| 1100 |
+
return str(result)
|
| 1101 |
+
return str(result)
|
| 1102 |
+
|
| 1103 |
def _extract_glm(self, result, include_thinking: bool = True) -> str:
|
| 1104 |
if isinstance(result, tuple) and len(result) >= 1:
|
| 1105 |
chatbot = result[0]
|
|
|
|
| 1120 |
return ResponseCleaner.clean_glm(str(result), include_thinking)
|
| 1121 |
|
| 1122 |
|
| 1123 |
+
# Factory
|
| 1124 |
def create_provider(model_id: str, config: Config,
|
| 1125 |
instance_id: int = 0) -> ModelProvider:
|
| 1126 |
if model_id not in MODEL_REGISTRY:
|
|
|
|
| 1131 |
return GradioClientProvider(mdef, config, instance_id)
|
| 1132 |
|
| 1133 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1134 |
+
# LOAD BALANCER
|
|
|
|
| 1135 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1136 |
|
| 1137 |
class LoadBalancedProviderPool:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1138 |
def __init__(self, model_id: str, config: Config):
|
| 1139 |
self.model_id = model_id
|
| 1140 |
self.config = config
|
|
|
|
| 1157 |
return len(self._instances)
|
| 1158 |
|
| 1159 |
def initialize_all(self) -> int:
|
|
|
|
| 1160 |
ok = 0
|
| 1161 |
for inst in self._instances:
|
| 1162 |
try:
|
|
|
|
| 1170 |
return ok
|
| 1171 |
|
| 1172 |
def initialize_one(self) -> bool:
|
|
|
|
| 1173 |
for inst in self._instances:
|
| 1174 |
try:
|
| 1175 |
if inst.initialize():
|
|
|
|
| 1179 |
return False
|
| 1180 |
|
| 1181 |
def _select_instance(self) -> ModelProvider:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1182 |
if len(self._instances) == 1:
|
| 1183 |
return self._instances[0]
|
| 1184 |
|
| 1185 |
with self._lock:
|
|
|
|
| 1186 |
scored = []
|
| 1187 |
for inst in self._instances:
|
| 1188 |
score = inst.health_score
|
|
|
|
| 1189 |
scored.append((inst, max(score, 0.05)))
|
| 1190 |
|
| 1191 |
total_weight = sum(s for _, s in scored)
|
| 1192 |
if total_weight <= 0:
|
|
|
|
| 1193 |
inst = self._instances[self._rr_index % len(self._instances)]
|
| 1194 |
self._rr_index += 1
|
| 1195 |
return inst
|
| 1196 |
|
|
|
|
| 1197 |
r = random.uniform(0, total_weight)
|
| 1198 |
cumulative = 0.0
|
| 1199 |
for inst, weight in scored:
|
|
|
|
| 1201 |
if r <= cumulative:
|
| 1202 |
return inst
|
| 1203 |
|
|
|
|
| 1204 |
return scored[-1][0]
|
| 1205 |
|
| 1206 |
def _get_ordered_instances(self) -> List[ModelProvider]:
    """All pool instances, ordered healthiest first (for failover order)."""
    return sorted(self._instances, key=lambda p: -p.health_score)
|
| 1208 |
|
| 1209 |
def execute(self, fn_name: str, **kwargs) -> Any:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1210 |
primary = self._select_instance()
|
| 1211 |
metrics.record_lb_dispatch()
|
| 1212 |
|
|
|
|
| 1213 |
if not primary.ready:
|
| 1214 |
try:
|
| 1215 |
primary.initialize()
|
| 1216 |
except Exception:
|
| 1217 |
pass
|
| 1218 |
|
|
|
|
| 1219 |
start = time.monotonic()
|
| 1220 |
try:
|
| 1221 |
result = self._call_provider(primary, fn_name, **kwargs)
|
|
|
|
| 1229 |
f"'{self.model_id}' failed: {primary_err}"
|
| 1230 |
)
|
| 1231 |
|
|
|
|
| 1232 |
for inst in self._get_ordered_instances():
|
| 1233 |
if inst is primary:
|
| 1234 |
continue
|
|
|
|
| 1262 |
)
|
| 1263 |
|
| 1264 |
def execute_stream(self, **kwargs) -> Generator[str, None, None]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1265 |
primary = self._select_instance()
|
| 1266 |
metrics.record_lb_dispatch()
|
| 1267 |
|
|
|
|
| 1271 |
except Exception:
|
| 1272 |
pass
|
| 1273 |
|
|
|
|
| 1274 |
try:
|
| 1275 |
yield from self._call_provider_stream(primary, **kwargs)
|
| 1276 |
return
|
|
|
|
| 1281 |
f"for '{self.model_id}' failed: {primary_err}"
|
| 1282 |
)
|
| 1283 |
|
|
|
|
| 1284 |
for inst in self._get_ordered_instances():
|
| 1285 |
if inst is primary:
|
| 1286 |
continue
|
|
|
|
| 1331 |
"model_id": self.model_id,
|
| 1332 |
"lb_enabled": self.mdef.lb_enabled,
|
| 1333 |
"pool_size": len(self._instances),
|
| 1334 |
+
"is_beta": self.mdef.is_beta,
|
| 1335 |
"instances": [inst.get_instance_info() for inst in self._instances],
|
| 1336 |
}
|
| 1337 |
|
|
|
|
| 1370 |
return self._lb_pools[model_id]
|
| 1371 |
|
| 1372 |
def _ensure_ready(self, model_id: str) -> LoadBalancedProviderPool:
|
| 1373 |
+
lb_pool = self._get_lb_pool(model_id)
|
| 1374 |
+
has_ready = any(inst.ready for inst in lb_pool._instances)
|
|
|
|
| 1375 |
if not has_ready:
|
| 1376 |
+
if not lb_pool.initialize_one():
|
| 1377 |
raise APIError(f"Cannot init any instance for {model_id}",
|
| 1378 |
"INIT_FAILED")
|
| 1379 |
+
return lb_pool
|
| 1380 |
|
| 1381 |
@property
|
| 1382 |
def active_conversation(self) -> Conversation:
|
|
|
|
| 1401 |
|
| 1402 |
def init_model(self, model_id: str) -> bool:
    """Best-effort init of a single pooled instance for *model_id*.

    Returns False instead of raising on any failure.
    """
    try:
        return self._get_lb_pool(model_id).initialize_one()
    except Exception:
        return False
|
| 1408 |
|
| 1409 |
def init_model_all(self, model_id: str) -> int:
    """Initialize every pooled instance for *model_id*.

    Returns the count of successfully initialized instances, or 0 on any error.
    """
    try:
        return self._get_lb_pool(model_id).initialize_all()
    except Exception:
        return 0
|
| 1415 |
|
|
|
|
| 1538 |
|
| 1539 |
def get_status(self) -> Dict:
|
| 1540 |
lb_info = {}
|
| 1541 |
+
for model_id, lb_pool in self._lb_pools.items():
|
| 1542 |
+
lb_info[model_id] = lb_pool.get_pool_info()
|
| 1543 |
|
| 1544 |
return {
|
| 1545 |
"version": VERSION,
|
|
|
|
| 1552 |
}
|
| 1553 |
|
| 1554 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1555 |
+
# SESSION POOL
|
| 1556 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1557 |
|
| 1558 |
class SessionPool:
|
|
|
|
| 1590 |
"cohere-vision": "command-a-vision",
|
| 1591 |
"command-translate": "command-a-translate",
|
| 1592 |
"cohere-translate": "command-a-translate", "translate": "command-a-translate",
|
| 1593 |
+
"command-reasoning": "command-a-reasoning", "reasoning": "command-a-reasoning",
|
| 1594 |
+
"cohere-reasoning": "command-a-reasoning", "command-r": "command-a-reasoning",
|
| 1595 |
"minimax": "minimax-vl-01", "minimax-vl": "minimax-vl-01",
|
| 1596 |
"glm": "glm-4.5", "glm4": "glm-4.5", "glm-4": "glm-4.5", "zhipu": "glm-4.5",
|
| 1597 |
"gpt": "chatgpt", "gpt-3.5": "chatgpt", "gpt3": "chatgpt", "openai": "chatgpt",
|
| 1598 |
"qwen": "qwen3-vl", "qwen3": "qwen3-vl", "qwen-vl": "qwen3-vl",
|
| 1599 |
+
"qwen-coder": "qwen2.5-coder", "qwen2.5": "qwen2.5-coder",
|
| 1600 |
+
"qwen25-coder": "qwen2.5-coder", "coder": "qwen2.5-coder",
|
| 1601 |
}
|
| 1602 |
|
| 1603 |
|
|
|
|
| 1636 |
"default_model": config.default_model,
|
| 1637 |
"features": ["load_balancing", "10_req_per_second_limit", "failover"],
|
| 1638 |
"models": list(MODEL_REGISTRY.keys()),
|
| 1639 |
+
"beta_models": [mid for mid, mdef in MODEL_REGISTRY.items() if mdef.is_beta],
|
| 1640 |
"endpoints": {
|
| 1641 |
"POST /chat": "Chat with any model",
|
| 1642 |
"POST /chat/stream": "Streaming chat",
|
|
|
|
| 1661 |
client = pool.acquire()
|
| 1662 |
if data.get("new_conversation"):
|
| 1663 |
client.new_conversation(data.get("system_prompt"), model_id)
|
| 1664 |
+
|
| 1665 |
+
# Pass extra params for specific models
|
| 1666 |
+
extra = {}
|
| 1667 |
+
if model_id == "command-a-reasoning" and "thinking_budget" in data:
|
| 1668 |
+
extra["thinking_budget"] = data["thinking_budget"]
|
| 1669 |
+
|
| 1670 |
result = client.send_message(
|
| 1671 |
message, model=model_id,
|
| 1672 |
system_prompt=data.get("system_prompt"),
|
| 1673 |
temperature=data.get("temperature"),
|
| 1674 |
max_tokens=data.get("max_tokens"),
|
| 1675 |
include_thinking=include_thinking,
|
| 1676 |
+
**extra,
|
| 1677 |
)
|
| 1678 |
thinking, clean = ThinkingParser.split(result)
|
| 1679 |
+
mdef = MODEL_REGISTRY.get(model_id)
|
| 1680 |
resp = {
|
| 1681 |
"ok": True,
|
| 1682 |
"response": clean,
|
|
|
|
| 1686 |
}
|
| 1687 |
if thinking:
|
| 1688 |
resp["thinking"] = thinking
|
| 1689 |
+
if mdef and mdef.is_beta:
|
| 1690 |
+
resp["beta"] = True
|
| 1691 |
return jsonify(resp)
|
| 1692 |
|
| 1693 |
|
|
|
|
| 1705 |
mdef = MODEL_REGISTRY.get(model_id)
|
| 1706 |
use_stream = mdef.supports_streaming if mdef else False
|
| 1707 |
|
| 1708 |
+
extra = {}
|
| 1709 |
+
if model_id == "command-a-reasoning" and "thinking_budget" in data:
|
| 1710 |
+
extra["thinking_budget"] = data["thinking_budget"]
|
| 1711 |
+
|
| 1712 |
def generate():
|
| 1713 |
try:
|
| 1714 |
if use_stream:
|
|
|
|
| 1718 |
temperature=data.get("temperature"),
|
| 1719 |
max_tokens=data.get("max_tokens"),
|
| 1720 |
include_thinking=include_thinking,
|
| 1721 |
+
**extra,
|
| 1722 |
):
|
| 1723 |
yield f"data: {json.dumps({'chunk': chunk})}\n\n"
|
| 1724 |
else:
|
|
|
|
| 1728 |
temperature=data.get("temperature"),
|
| 1729 |
max_tokens=data.get("max_tokens"),
|
| 1730 |
include_thinking=include_thinking,
|
| 1731 |
+
**extra,
|
| 1732 |
)
|
| 1733 |
yield f"data: {json.dumps({'chunk': result})}\n\n"
|
| 1734 |
yield "data: [DONE]\n\n"
|
|
|
|
| 1743 |
def list_models():
|
| 1744 |
models = []
|
| 1745 |
for mid, mdef in MODEL_REGISTRY.items():
|
| 1746 |
+
model_info = {
|
| 1747 |
"id": mid,
|
| 1748 |
"object": "model",
|
| 1749 |
"owned_by": mdef.owned_by,
|
|
|
|
| 1761 |
"enabled": mdef.lb_enabled,
|
| 1762 |
"pool_size": mdef.lb_pool_size,
|
| 1763 |
},
|
| 1764 |
+
}
|
| 1765 |
+
if mdef.is_beta:
|
| 1766 |
+
model_info["beta"] = True
|
| 1767 |
+
models.append(model_info)
|
| 1768 |
return jsonify({"object": "list", "data": models})
|
| 1769 |
|
| 1770 |
|
|
|
|
| 1804 |
client = pool.acquire()
|
| 1805 |
client.new_conversation(system_prompt, model_id)
|
| 1806 |
|
|
|
|
| 1807 |
for msg in messages[:-1]:
|
| 1808 |
role = msg.get("role")
|
| 1809 |
content = msg.get("content", "")
|
|
|
|
| 1812 |
|
| 1813 |
mdef = MODEL_REGISTRY[model_id]
|
| 1814 |
|
| 1815 |
+
# Extra params
|
| 1816 |
+
extra = {}
|
| 1817 |
+
if model_id == "command-a-reasoning" and "thinking_budget" in data:
|
| 1818 |
+
extra["thinking_budget"] = data["thinking_budget"]
|
| 1819 |
+
|
| 1820 |
if do_stream:
|
| 1821 |
def generate():
|
| 1822 |
try:
|
| 1823 |
+
yield f"data: {json.dumps({'id': rid, 'object': 'chat.completion.chunk', 'created': created, 'model': model_id, 'choices': [{'index': 0, 'delta': {'role': 'assistant'}, 'finish_reason': None}]})}\n\n"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1824 |
if mdef.supports_streaming:
|
| 1825 |
for chunk in client.send_message(
|
| 1826 |
user_msg, stream=True, model=model_id,
|
| 1827 |
temperature=temperature, max_tokens=max_tokens,
|
| 1828 |
+
include_thinking=include_thinking, **extra,
|
| 1829 |
):
|
| 1830 |
+
yield f"data: {json.dumps({'id': rid, 'object': 'chat.completion.chunk', 'created': created, 'model': model_id, 'choices': [{'index': 0, 'delta': {'content': chunk}, 'finish_reason': None}]})}\n\n"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1831 |
else:
|
| 1832 |
result = client.send_message(
|
| 1833 |
user_msg, model=model_id, temperature=temperature,
|
| 1834 |
max_tokens=max_tokens,
|
| 1835 |
+
include_thinking=include_thinking, **extra,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1836 |
)
|
| 1837 |
+
yield f"data: {json.dumps({'id': rid, 'object': 'chat.completion.chunk', 'created': created, 'model': model_id, 'choices': [{'index': 0, 'delta': {'content': result}, 'finish_reason': None}]})}\n\n"
|
| 1838 |
+
yield f"data: {json.dumps({'id': rid, 'object': 'chat.completion.chunk', 'created': created, 'model': model_id, 'choices': [{'index': 0, 'delta': {}, 'finish_reason': 'stop'}]})}\n\n"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1839 |
yield "data: [DONE]\n\n"
|
| 1840 |
except Exception as e:
|
| 1841 |
yield f"data: {json.dumps({'error': {'message': str(e)}})}\n\n"
|
|
|
|
| 1845 |
|
| 1846 |
result = client.send_message(
|
| 1847 |
user_msg, model=model_id, temperature=temperature,
|
| 1848 |
+
max_tokens=max_tokens, include_thinking=include_thinking, **extra,
|
| 1849 |
)
|
| 1850 |
return jsonify({
|
| 1851 |
"id": rid,
|
|
|
|
| 1891 |
|
| 1892 |
@app.route("/lb/status", methods=["GET"])
|
| 1893 |
def lb_status():
|
|
|
|
| 1894 |
all_pools = {}
|
| 1895 |
for client in pool._clients:
|
| 1896 |
for model_id, lb_pool in client._lb_pools.items():
|
| 1897 |
+
key = model_id
|
| 1898 |
if key not in all_pools:
|
| 1899 |
all_pools[key] = []
|
| 1900 |
all_pools[key].append(lb_pool.get_pool_info())
|
|
|
|
| 1925 |
}), 400
|
| 1926 |
count = pool.init_model(model_id)
|
| 1927 |
mdef = MODEL_REGISTRY[model_id]
|
| 1928 |
+
resp = {
|
| 1929 |
"ok": True,
|
| 1930 |
"model": model_id,
|
| 1931 |
"initialized_instances": count,
|
| 1932 |
"lb_enabled": mdef.lb_enabled,
|
| 1933 |
"pool_size_per_client": mdef.lb_pool_size,
|
| 1934 |
+
}
|
| 1935 |
+
if mdef.is_beta:
|
| 1936 |
+
resp["beta"] = True
|
| 1937 |
+
return jsonify(resp)
|
| 1938 |
|
| 1939 |
|
| 1940 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
|
|
| 1947 |
log.info(f"Models: {list(MODEL_REGISTRY.keys())}")
|
| 1948 |
log.info(f"Rate limit: {config.rate_limit_rps} req/s (burst: {config.rate_limit_burst})")
|
| 1949 |
for mid, mdef in MODEL_REGISTRY.items():
|
| 1950 |
+
lb_str = (
|
| 1951 |
f"LB ON (pool={mdef.lb_pool_size})"
|
| 1952 |
if mdef.lb_enabled
|
| 1953 |
else "LB OFF (single instance)"
|
| 1954 |
)
|
| 1955 |
+
beta_str = " [BETA]" if mdef.is_beta else ""
|
| 1956 |
+
log.info(f" {mid}: {lb_str}{beta_str}")
|
| 1957 |
app.run(host="0.0.0.0", port=port, threaded=True)
|