Spaces:

Yash030
/

claude-code-proxy

Running

Yash030 Claude Opus 4.7 committed 4 days ago

Commit

a5ea640

1 Parent(s): fcc5278

Speed up NIM provider with failure tracking and faster timeouts

- Add ModelHealthTracker for per-model failure tracking (30s TTL)
- Faster timeouts: connect=10s, first_chunk=30s, fallback=20s
- Auto model pre-checks health before attempting requests
- Record failures on timeout/rate-limit for smart fallback
- Skip unhealthy models in both router and services

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>

Files changed (15) hide show

.claude/settings.local.json +5 -1
api/__pycache__/detection.cpython-314.pyc +0 -0
api/__pycache__/model_router.cpython-314.pyc +0 -0
api/__pycache__/optimization_handlers.cpython-314.pyc +0 -0
api/__pycache__/routes.cpython-314.pyc +0 -0
api/__pycache__/runtime.cpython-314.pyc +0 -0
api/__pycache__/services.cpython-314.pyc +0 -0
api/model_router.py +8 -2
api/services.py +14 -2
core/__pycache__/session_tracker.cpython-314.pyc +0 -0
providers/__pycache__/openai_compat.cpython-314.pyc +0 -0
providers/__pycache__/rate_limit.cpython-314.pyc +0 -0
providers/nvidia_nim/__pycache__/client.cpython-314.pyc +0 -0
providers/nvidia_nim/client.py +4 -4
providers/rate_limit.py +79 -0

.claude/settings.local.json CHANGED Viewed

@@ -10,7 +10,11 @@
       "Bash(git push *)",
       "Bash(python -c \"import ast; ast.parse\\(open\\('api/services.py'\\).read\\(\\)\\); print\\('Syntax OK'\\)\")",
       "mcp__github__list_issues",
-      "mcp__github__update_issue"
     ]
   },
   "enableAllProjectMcpServers": true,

       "Bash(git push *)",
       "Bash(python -c \"import ast; ast.parse\\(open\\('api/services.py'\\).read\\(\\)\\); print\\('Syntax OK'\\)\")",
       "mcp__github__list_issues",
+      "mcp__github__update_issue",
+      "Bash(git commit *)",
+      "Bash(dir \"C:\\\\Users\\\\yashw\\\\.gemini\\\\antigravity\\\\llm_wiki_v2\\\\wiki\\\\entities\")",
+      "Bash(dir *)",
+      "Bash(node -e ' *)"
     ]
   },
   "enableAllProjectMcpServers": true,

api/__pycache__/detection.cpython-314.pyc CHANGED Viewed

Binary files a/api/__pycache__/detection.cpython-314.pyc and b/api/__pycache__/detection.cpython-314.pyc differ

api/__pycache__/model_router.cpython-314.pyc CHANGED Viewed

Binary files a/api/__pycache__/model_router.cpython-314.pyc and b/api/__pycache__/model_router.cpython-314.pyc differ

api/__pycache__/optimization_handlers.cpython-314.pyc CHANGED Viewed

Binary files a/api/__pycache__/optimization_handlers.cpython-314.pyc and b/api/__pycache__/optimization_handlers.cpython-314.pyc differ

api/__pycache__/routes.cpython-314.pyc CHANGED Viewed

Binary files a/api/__pycache__/routes.cpython-314.pyc and b/api/__pycache__/routes.cpython-314.pyc differ

api/__pycache__/runtime.cpython-314.pyc CHANGED Viewed

Binary files a/api/__pycache__/runtime.cpython-314.pyc and b/api/__pycache__/runtime.cpython-314.pyc differ

api/__pycache__/services.cpython-314.pyc CHANGED Viewed

Binary files a/api/__pycache__/services.cpython-314.pyc and b/api/__pycache__/services.cpython-314.pyc differ

api/model_router.py CHANGED Viewed

@@ -196,11 +196,17 @@ class ModelRouter:
                 if provider_id == "zen":
                     is_blocked = False
-                if is_blocked:
                     logger.debug(
-                        "Routing: candidate '{}' (from {}) is BLOCKED",
                         normalized_ref,
                         source,
                     )
                     blocked_candidates.append(resolved)
                 else:

                 if provider_id == "zen":
                     is_blocked = False
+                # Check model health (recent failures)
+                is_healthy = limiter.is_healthy(normalized_ref)
+                if is_blocked or not is_healthy:
+                    reason = "BLOCKED" if is_blocked else "UNHEALTHY"
                     logger.debug(
+                        "Routing: candidate '{}' (from {}) is {} (health={})",
                         normalized_ref,
                         source,
+                        reason,
+                        is_healthy,
                     )
                     blocked_candidates.append(resolved)
                 else:

api/services.py CHANGED Viewed

@@ -244,18 +244,27 @@ class ClaudeProxyService:
         for i, resolved in enumerate(candidates):
             try:
-                # Pre-check: skip candidates that are currently rate limited
                 from providers.rate_limit import GlobalRateLimiter
                 limiter = GlobalRateLimiter.get_scoped_instance(resolved.provider_id)
                 if limiter.is_blocked() and resolved.provider_id != "zen":
                     logger.warning(
-                        "Provider '{} is currently rate limited, skipping to next candidate...",
                         resolved.provider_id,
                     )
                     last_exc = Exception("Rate limited")
                     continue
                 provider = self._provider_getter(resolved.provider_id)
                 routed_request = request_data.model_copy(deep=True)
                 routed_request.model = resolved.provider_model
@@ -302,6 +311,7 @@ class ClaudeProxyService:
                     resolved.provider_id,
                     e.status_code,
                 )
                 last_exc = e
                 continue
             except TimeoutError as e:
@@ -311,6 +321,7 @@ class ClaudeProxyService:
                     resolved.provider_id,
                     type(e).__name__,
                 )
                 last_exc = e
                 continue
             except Exception as e:
@@ -334,6 +345,7 @@ class ClaudeProxyService:
                         type(e).__name__,
                         e,
                     )
                     last_exc = e
                     continue

         for i, resolved in enumerate(candidates):
             try:
+                # Pre-check: skip candidates that are currently rate limited or unhealthy
                 from providers.rate_limit import GlobalRateLimiter
                 limiter = GlobalRateLimiter.get_scoped_instance(resolved.provider_id)
                 if limiter.is_blocked() and resolved.provider_id != "zen":
                     logger.warning(
+                        "Provider '{}' is currently rate limited, skipping to next candidate...",
                         resolved.provider_id,
                     )
                     last_exc = Exception("Rate limited")
                     continue
+                # Check model health (recent failures)
+                if not limiter.is_healthy(resolved.provider_model_ref):
+                    logger.warning(
+                        "Provider '{}' has recent failures, skipping to next candidate...",
+                        resolved.provider_model_ref,
+                    )
+                    last_exc = Exception("Recent failures")
+                    continue
                 provider = self._provider_getter(resolved.provider_id)
                 routed_request = request_data.model_copy(deep=True)
                 routed_request.model = resolved.provider_model
                     resolved.provider_id,
                     e.status_code,
                 )
+                limiter.record_failure(resolved.provider_model_ref)
                 last_exc = e
                 continue
             except TimeoutError as e:
                     resolved.provider_id,
                     type(e).__name__,
                 )
+                limiter.record_failure(resolved.provider_model_ref)
                 last_exc = e
                 continue
             except Exception as e:
                         type(e).__name__,
                         e,
                     )
+                    limiter.record_failure(resolved.provider_model_ref)
                     last_exc = e
                     continue

core/__pycache__/session_tracker.cpython-314.pyc CHANGED Viewed

Binary files a/core/__pycache__/session_tracker.cpython-314.pyc and b/core/__pycache__/session_tracker.cpython-314.pyc differ

providers/__pycache__/openai_compat.cpython-314.pyc CHANGED Viewed

Binary files a/providers/__pycache__/openai_compat.cpython-314.pyc and b/providers/__pycache__/openai_compat.cpython-314.pyc differ

providers/__pycache__/rate_limit.cpython-314.pyc CHANGED Viewed

Binary files a/providers/__pycache__/rate_limit.cpython-314.pyc and b/providers/__pycache__/rate_limit.cpython-314.pyc differ

providers/nvidia_nim/__pycache__/client.cpython-314.pyc CHANGED Viewed

Binary files a/providers/nvidia_nim/__pycache__/client.cpython-314.pyc and b/providers/nvidia_nim/__pycache__/client.cpython-314.pyc differ

providers/nvidia_nim/client.py CHANGED Viewed

@@ -108,10 +108,10 @@ class NvidiaNimProvider(OpenAIChatTransport):
         """
         from config.settings import get_settings
-        # Reduced timeouts for faster fallback detection
-        connect_timeout_s = 15  # Reduced from 30
-        first_chunk_timeout_s = 45  # Reduced from 60
-        fallback_first_chunk_timeout_s = 30  # Reduced from 60 - faster fallback
         try:
             client = self._client_for_body(body)

         """
         from config.settings import get_settings
+        # Faster timeouts for quick failover detection
+        connect_timeout_s = 10  # Reduced from 15
+        first_chunk_timeout_s = 30  # Reduced from 45
+        fallback_first_chunk_timeout_s = 20  # Reduced from 30
         try:
             client = self._client_for_body(body)

providers/rate_limit.py CHANGED Viewed

@@ -16,6 +16,60 @@ from core.rate_limit import StrictSlidingWindowLimiter
 T = TypeVar("T")
 class GlobalRateLimiter:
     """
     Global singleton rate limiter that blocks all requests
@@ -181,6 +235,31 @@ class GlobalRateLimiter:
         """Get remaining reactive wait time in seconds."""
         return max(0.0, self._blocked_until - time.monotonic())
     @asynccontextmanager
     async def concurrency_slot(self) -> AsyncIterator[None]:
         """Async context manager that holds one concurrency slot for a stream.

 T = TypeVar("T")
+class ModelHealthTracker:
+    """Track per-model health from failure timestamps in a sliding window.
+    A model counts as healthy while it has fewer than ``max_failures``
+    recorded failures within the last ``failure_ttl`` seconds. Timestamps
+    are taken from ``time.monotonic()``.
+    """
+    # Shared instance, created lazily by get_instance().
+    _instance: ClassVar["ModelHealthTracker | None"] = None
+    def __init__(self, failure_ttl: float = 30.0, max_failures: int = 3) -> None:
+        """Create a tracker.
+        Args:
+            failure_ttl: Seconds a recorded failure keeps counting against a model.
+            max_failures: Failure count within the window at which a model is unhealthy.
+        """
+        self._failure_ttl = failure_ttl
+        self._max_failures = max_failures
+        # model_ref -> monotonic timestamps of failures still being tracked.
+        self._failures: dict[str, list[float]] = {}
+    @classmethod
+    def get_instance(cls) -> "ModelHealthTracker":
+        """Return the shared tracker, creating it with default settings on first use."""
+        if cls._instance is None:
+            cls._instance = cls()
+        return cls._instance
+    def _prune(self, model_ref: str) -> list[float]:
+        """Drop expired timestamps for a model and return those still inside the window.
+        Models left with no recent failures are removed from the table entirely,
+        so the table does not accumulate keys for models that have recovered.
+        """
+        stamps = self._failures.get(model_ref)
+        if not stamps:
+            self._failures.pop(model_ref, None)
+            return []
+        cutoff = time.monotonic() - self._failure_ttl
+        recent = [t for t in stamps if t > cutoff]
+        if recent:
+            self._failures[model_ref] = recent
+        else:
+            del self._failures[model_ref]
+        return recent
+    def record_failure(self, model_ref: str) -> None:
+        """Record a failure timestamp for a model."""
+        # Prune first so a model that keeps failing cannot grow its list without bound.
+        recent = self._prune(model_ref)
+        recent.append(time.monotonic())
+        self._failures[model_ref] = recent
+        logger.debug("HEALTH: recorded failure for '{}'", model_ref)
+    def is_healthy(self, model_ref: str) -> bool:
+        """Check if model has had fewer than max_failures in the TTL window.
+        Returns:
+            True when the model has no recent failures or fewer than
+            ``max_failures`` of them; False otherwise.
+        """
+        recent = self._prune(model_ref)
+        if not recent:
+            return True
+        healthy = len(recent) < self._max_failures
+        if not healthy:
+            logger.debug(
+                "HEALTH: model '{}' is unhealthy ({} failures in {}s)",
+                model_ref,
+                len(recent),
+                self._failure_ttl,
+            )
+        return healthy
+    def get_failure_count(self, model_ref: str) -> int:
+        """Get number of recent failures for a model."""
+        return len(self._prune(model_ref))
+    def clear_failures(self, model_ref: str) -> None:
+        """Clear failure history for a model (on success)."""
+        self._failures.pop(model_ref, None)
 class GlobalRateLimiter:
     """
     Global singleton rate limiter that blocks all requests
         """Get remaining reactive wait time in seconds."""
         return max(0.0, self._blocked_until - time.monotonic())
+    def record_failure(self, model_ref: str | None = None) -> None:
+        """Forward a failure to the shared per-model health tracker.
+        Args:
+            model_ref: Model identifier to record the failure against. When it
+                is None or empty, nothing is recorded.
+        """
+        if not model_ref:
+            return
+        # The tracker is shared, so the failure is visible through every limiter.
+        ModelHealthTracker.get_instance().record_failure(model_ref)
+    def is_healthy(self, model_ref: str | None = None) -> bool:
+        """Report whether a model is healthy according to the shared health tracker.
+        Args:
+            model_ref: Model identifier to look up. None means there is nothing
+                to check.
+        Returns:
+            True when model_ref is None; otherwise the tracker's verdict for
+            that model.
+        """
+        return model_ref is None or ModelHealthTracker.get_instance().is_healthy(model_ref)
     @asynccontextmanager
     async def concurrency_slot(self) -> AsyncIterator[None]:
         """Async context manager that holds one concurrency slot for a stream.