Spaces:
Running
Running
Speed up NIM provider with failure tracking and faster timeouts
Browse files- Add ModelHealthTracker for per-model failure tracking (30s TTL)
- Faster timeouts: connect=10s, first_chunk=30s, fallback=20s
- Auto model pre-checks health before attempting requests
- Record failures on timeout/rate-limit for smart fallback
- Skip unhealthy models in both router and services
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
- .claude/settings.local.json +5 -1
- api/__pycache__/detection.cpython-314.pyc +0 -0
- api/__pycache__/model_router.cpython-314.pyc +0 -0
- api/__pycache__/optimization_handlers.cpython-314.pyc +0 -0
- api/__pycache__/routes.cpython-314.pyc +0 -0
- api/__pycache__/runtime.cpython-314.pyc +0 -0
- api/__pycache__/services.cpython-314.pyc +0 -0
- api/model_router.py +8 -2
- api/services.py +14 -2
- core/__pycache__/session_tracker.cpython-314.pyc +0 -0
- providers/__pycache__/openai_compat.cpython-314.pyc +0 -0
- providers/__pycache__/rate_limit.cpython-314.pyc +0 -0
- providers/nvidia_nim/__pycache__/client.cpython-314.pyc +0 -0
- providers/nvidia_nim/client.py +4 -4
- providers/rate_limit.py +79 -0
.claude/settings.local.json
CHANGED
|
@@ -10,7 +10,11 @@
|
|
| 10 |
"Bash(git push *)",
|
| 11 |
"Bash(python -c \"import ast; ast.parse\\(open\\('api/services.py'\\).read\\(\\)\\); print\\('Syntax OK'\\)\")",
|
| 12 |
"mcp__github__list_issues",
|
| 13 |
-
"mcp__github__update_issue"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
]
|
| 15 |
},
|
| 16 |
"enableAllProjectMcpServers": true,
|
|
|
|
| 10 |
"Bash(git push *)",
|
| 11 |
"Bash(python -c \"import ast; ast.parse\\(open\\('api/services.py'\\).read\\(\\)\\); print\\('Syntax OK'\\)\")",
|
| 12 |
"mcp__github__list_issues",
|
| 13 |
+
"mcp__github__update_issue",
|
| 14 |
+
"Bash(git commit *)",
|
| 15 |
+
"Bash(dir \"C:\\\\Users\\\\yashw\\\\.gemini\\\\antigravity\\\\llm_wiki_v2\\\\wiki\\\\entities\")",
|
| 16 |
+
"Bash(dir *)",
|
| 17 |
+
"Bash(node -e ' *)"
|
| 18 |
]
|
| 19 |
},
|
| 20 |
"enableAllProjectMcpServers": true,
|
api/__pycache__/detection.cpython-314.pyc
CHANGED
|
Binary files a/api/__pycache__/detection.cpython-314.pyc and b/api/__pycache__/detection.cpython-314.pyc differ
|
|
|
api/__pycache__/model_router.cpython-314.pyc
CHANGED
|
Binary files a/api/__pycache__/model_router.cpython-314.pyc and b/api/__pycache__/model_router.cpython-314.pyc differ
|
|
|
api/__pycache__/optimization_handlers.cpython-314.pyc
CHANGED
|
Binary files a/api/__pycache__/optimization_handlers.cpython-314.pyc and b/api/__pycache__/optimization_handlers.cpython-314.pyc differ
|
|
|
api/__pycache__/routes.cpython-314.pyc
CHANGED
|
Binary files a/api/__pycache__/routes.cpython-314.pyc and b/api/__pycache__/routes.cpython-314.pyc differ
|
|
|
api/__pycache__/runtime.cpython-314.pyc
CHANGED
|
Binary files a/api/__pycache__/runtime.cpython-314.pyc and b/api/__pycache__/runtime.cpython-314.pyc differ
|
|
|
api/__pycache__/services.cpython-314.pyc
CHANGED
|
Binary files a/api/__pycache__/services.cpython-314.pyc and b/api/__pycache__/services.cpython-314.pyc differ
|
|
|
api/model_router.py
CHANGED
|
@@ -196,11 +196,17 @@ class ModelRouter:
|
|
| 196 |
if provider_id == "zen":
|
| 197 |
is_blocked = False
|
| 198 |
|
| 199 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 200 |
logger.debug(
|
| 201 |
-
"Routing: candidate '{}' (from {}) is
|
| 202 |
normalized_ref,
|
| 203 |
source,
|
|
|
|
|
|
|
| 204 |
)
|
| 205 |
blocked_candidates.append(resolved)
|
| 206 |
else:
|
|
|
|
| 196 |
if provider_id == "zen":
|
| 197 |
is_blocked = False
|
| 198 |
|
| 199 |
+
# Check model health (recent failures)
|
| 200 |
+
is_healthy = limiter.is_healthy(normalized_ref)
|
| 201 |
+
|
| 202 |
+
if is_blocked or not is_healthy:
|
| 203 |
+
reason = "BLOCKED" if is_blocked else "UNHEALTHY"
|
| 204 |
logger.debug(
|
| 205 |
+
"Routing: candidate '{}' (from {}) is {} (health={})",
|
| 206 |
normalized_ref,
|
| 207 |
source,
|
| 208 |
+
reason,
|
| 209 |
+
is_healthy,
|
| 210 |
)
|
| 211 |
blocked_candidates.append(resolved)
|
| 212 |
else:
|
api/services.py
CHANGED
|
@@ -244,18 +244,27 @@ class ClaudeProxyService:
|
|
| 244 |
|
| 245 |
for i, resolved in enumerate(candidates):
|
| 246 |
try:
|
| 247 |
-
# Pre-check: skip candidates that are currently rate limited
|
| 248 |
from providers.rate_limit import GlobalRateLimiter
|
| 249 |
|
| 250 |
limiter = GlobalRateLimiter.get_scoped_instance(resolved.provider_id)
|
| 251 |
if limiter.is_blocked() and resolved.provider_id != "zen":
|
| 252 |
logger.warning(
|
| 253 |
-
"Provider '{} is currently rate limited, skipping to next candidate...",
|
| 254 |
resolved.provider_id,
|
| 255 |
)
|
| 256 |
last_exc = Exception("Rate limited")
|
| 257 |
continue
|
| 258 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 259 |
provider = self._provider_getter(resolved.provider_id)
|
| 260 |
routed_request = request_data.model_copy(deep=True)
|
| 261 |
routed_request.model = resolved.provider_model
|
|
@@ -302,6 +311,7 @@ class ClaudeProxyService:
|
|
| 302 |
resolved.provider_id,
|
| 303 |
e.status_code,
|
| 304 |
)
|
|
|
|
| 305 |
last_exc = e
|
| 306 |
continue
|
| 307 |
except TimeoutError as e:
|
|
@@ -311,6 +321,7 @@ class ClaudeProxyService:
|
|
| 311 |
resolved.provider_id,
|
| 312 |
type(e).__name__,
|
| 313 |
)
|
|
|
|
| 314 |
last_exc = e
|
| 315 |
continue
|
| 316 |
except Exception as e:
|
|
@@ -334,6 +345,7 @@ class ClaudeProxyService:
|
|
| 334 |
type(e).__name__,
|
| 335 |
e,
|
| 336 |
)
|
|
|
|
| 337 |
last_exc = e
|
| 338 |
continue
|
| 339 |
|
|
|
|
| 244 |
|
| 245 |
for i, resolved in enumerate(candidates):
|
| 246 |
try:
|
| 247 |
+
# Pre-check: skip candidates that are currently rate limited or unhealthy
|
| 248 |
from providers.rate_limit import GlobalRateLimiter
|
| 249 |
|
| 250 |
limiter = GlobalRateLimiter.get_scoped_instance(resolved.provider_id)
|
| 251 |
if limiter.is_blocked() and resolved.provider_id != "zen":
|
| 252 |
logger.warning(
|
| 253 |
+
"Provider '{}' is currently rate limited, skipping to next candidate...",
|
| 254 |
resolved.provider_id,
|
| 255 |
)
|
| 256 |
last_exc = Exception("Rate limited")
|
| 257 |
continue
|
| 258 |
|
| 259 |
+
# Check model health (recent failures)
|
| 260 |
+
if not limiter.is_healthy(resolved.provider_model_ref):
|
| 261 |
+
logger.warning(
|
| 262 |
+
"Provider '{}' has recent failures, skipping to next candidate...",
|
| 263 |
+
resolved.provider_model_ref,
|
| 264 |
+
)
|
| 265 |
+
last_exc = Exception("Recent failures")
|
| 266 |
+
continue
|
| 267 |
+
|
| 268 |
provider = self._provider_getter(resolved.provider_id)
|
| 269 |
routed_request = request_data.model_copy(deep=True)
|
| 270 |
routed_request.model = resolved.provider_model
|
|
|
|
| 311 |
resolved.provider_id,
|
| 312 |
e.status_code,
|
| 313 |
)
|
| 314 |
+
limiter.record_failure(resolved.provider_model_ref)
|
| 315 |
last_exc = e
|
| 316 |
continue
|
| 317 |
except TimeoutError as e:
|
|
|
|
| 321 |
resolved.provider_id,
|
| 322 |
type(e).__name__,
|
| 323 |
)
|
| 324 |
+
limiter.record_failure(resolved.provider_model_ref)
|
| 325 |
last_exc = e
|
| 326 |
continue
|
| 327 |
except Exception as e:
|
|
|
|
| 345 |
type(e).__name__,
|
| 346 |
e,
|
| 347 |
)
|
| 348 |
+
limiter.record_failure(resolved.provider_model_ref)
|
| 349 |
last_exc = e
|
| 350 |
continue
|
| 351 |
|
core/__pycache__/session_tracker.cpython-314.pyc
CHANGED
|
Binary files a/core/__pycache__/session_tracker.cpython-314.pyc and b/core/__pycache__/session_tracker.cpython-314.pyc differ
|
|
|
providers/__pycache__/openai_compat.cpython-314.pyc
CHANGED
|
Binary files a/providers/__pycache__/openai_compat.cpython-314.pyc and b/providers/__pycache__/openai_compat.cpython-314.pyc differ
|
|
|
providers/__pycache__/rate_limit.cpython-314.pyc
CHANGED
|
Binary files a/providers/__pycache__/rate_limit.cpython-314.pyc and b/providers/__pycache__/rate_limit.cpython-314.pyc differ
|
|
|
providers/nvidia_nim/__pycache__/client.cpython-314.pyc
CHANGED
|
Binary files a/providers/nvidia_nim/__pycache__/client.cpython-314.pyc and b/providers/nvidia_nim/__pycache__/client.cpython-314.pyc differ
|
|
|
providers/nvidia_nim/client.py
CHANGED
|
@@ -108,10 +108,10 @@ class NvidiaNimProvider(OpenAIChatTransport):
|
|
| 108 |
"""
|
| 109 |
from config.settings import get_settings
|
| 110 |
|
| 111 |
-
#
|
| 112 |
-
connect_timeout_s =
|
| 113 |
-
first_chunk_timeout_s =
|
| 114 |
-
fallback_first_chunk_timeout_s =
|
| 115 |
|
| 116 |
try:
|
| 117 |
client = self._client_for_body(body)
|
|
|
|
| 108 |
"""
|
| 109 |
from config.settings import get_settings
|
| 110 |
|
| 111 |
+
# Faster timeouts for quick failover detection
|
| 112 |
+
connect_timeout_s = 10 # Reduced from 15
|
| 113 |
+
first_chunk_timeout_s = 30 # Reduced from 45
|
| 114 |
+
fallback_first_chunk_timeout_s = 20 # Reduced from 30
|
| 115 |
|
| 116 |
try:
|
| 117 |
client = self._client_for_body(body)
|
providers/rate_limit.py
CHANGED
|
@@ -16,6 +16,60 @@ from core.rate_limit import StrictSlidingWindowLimiter
|
|
| 16 |
T = TypeVar("T")
|
| 17 |
|
| 18 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
class GlobalRateLimiter:
|
| 20 |
"""
|
| 21 |
Global singleton rate limiter that blocks all requests
|
|
@@ -181,6 +235,31 @@ class GlobalRateLimiter:
|
|
| 181 |
"""Get remaining reactive wait time in seconds."""
|
| 182 |
return max(0.0, self._blocked_until - time.monotonic())
|
| 183 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 184 |
@asynccontextmanager
|
| 185 |
async def concurrency_slot(self) -> AsyncIterator[None]:
|
| 186 |
"""Async context manager that holds one concurrency slot for a stream.
|
|
|
|
| 16 |
T = TypeVar("T")
|
| 17 |
|
| 18 |
|
| 19 |
+
class ModelHealthTracker:
|
| 20 |
+
"""Track per-model health based on recent failures."""
|
| 21 |
+
|
| 22 |
+
_instance: ClassVar["ModelHealthTracker | None"] = None
|
| 23 |
+
|
| 24 |
+
def __init__(self, failure_ttl: float = 30.0, max_failures: int = 3) -> None:
|
| 25 |
+
self._failure_ttl = failure_ttl
|
| 26 |
+
self._max_failures = max_failures
|
| 27 |
+
self._failures: dict[str, list[float]] = {}
|
| 28 |
+
|
| 29 |
+
@classmethod
|
| 30 |
+
def get_instance(cls) -> "ModelHealthTracker":
|
| 31 |
+
if cls._instance is None:
|
| 32 |
+
cls._instance = cls()
|
| 33 |
+
return cls._instance
|
| 34 |
+
|
| 35 |
+
def record_failure(self, model_ref: str) -> None:
|
| 36 |
+
"""Record a failure timestamp for a model."""
|
| 37 |
+
now = time.monotonic()
|
| 38 |
+
if model_ref not in self._failures:
|
| 39 |
+
self._failures[model_ref] = []
|
| 40 |
+
self._failures[model_ref].append(now)
|
| 41 |
+
logger.debug("HEALTH: recorded failure for '{}'", model_ref)
|
| 42 |
+
|
| 43 |
+
def is_healthy(self, model_ref: str) -> bool:
|
| 44 |
+
"""Check if model has had fewer than max_failures in the TTL window."""
|
| 45 |
+
if model_ref not in self._failures:
|
| 46 |
+
return True
|
| 47 |
+
cutoff = time.monotonic() - self._failure_ttl
|
| 48 |
+
recent = [t for t in self._failures[model_ref] if t > cutoff]
|
| 49 |
+
self._failures[model_ref] = recent
|
| 50 |
+
healthy = len(recent) < self._max_failures
|
| 51 |
+
if not healthy:
|
| 52 |
+
logger.debug(
|
| 53 |
+
"HEALTH: model '{}' is unhealthy ({} failures in {}s)",
|
| 54 |
+
model_ref,
|
| 55 |
+
len(recent),
|
| 56 |
+
self._failure_ttl,
|
| 57 |
+
)
|
| 58 |
+
return healthy
|
| 59 |
+
|
| 60 |
+
def get_failure_count(self, model_ref: str) -> int:
|
| 61 |
+
"""Get number of recent failures for a model."""
|
| 62 |
+
if model_ref not in self._failures:
|
| 63 |
+
return 0
|
| 64 |
+
cutoff = time.monotonic() - self._failure_ttl
|
| 65 |
+
return len([t for t in self._failures[model_ref] if t > cutoff])
|
| 66 |
+
|
| 67 |
+
def clear_failures(self, model_ref: str) -> None:
|
| 68 |
+
"""Clear failure history for a model (on success)."""
|
| 69 |
+
if model_ref in self._failures:
|
| 70 |
+
self._failures.pop(model_ref)
|
| 71 |
+
|
| 72 |
+
|
| 73 |
class GlobalRateLimiter:
|
| 74 |
"""
|
| 75 |
Global singleton rate limiter that blocks all requests
|
|
|
|
| 235 |
"""Get remaining reactive wait time in seconds."""
|
| 236 |
return max(0.0, self._blocked_until - time.monotonic())
|
| 237 |
|
| 238 |
+
def record_failure(self, model_ref: str | None = None) -> None:
|
| 239 |
+
"""Record a failure for rate limit tracking.
|
| 240 |
+
|
| 241 |
+
Args:
|
| 242 |
+
model_ref: Optional model identifier for health tracking.
|
| 243 |
+
"""
|
| 244 |
+
# Record in the shared health tracker if model provided
|
| 245 |
+
if model_ref:
|
| 246 |
+
health = ModelHealthTracker.get_instance()
|
| 247 |
+
health.record_failure(model_ref)
|
| 248 |
+
|
| 249 |
+
def is_healthy(self, model_ref: str | None = None) -> bool:
|
| 250 |
+
"""Check if provider/model is healthy based on failure history.
|
| 251 |
+
|
| 252 |
+
Args:
|
| 253 |
+
model_ref: Optional model identifier for health tracking.
|
| 254 |
+
|
| 255 |
+
Returns:
|
| 256 |
+
True if no recent failures or model_ref is None.
|
| 257 |
+
"""
|
| 258 |
+
if model_ref is None:
|
| 259 |
+
return True
|
| 260 |
+
health = ModelHealthTracker.get_instance()
|
| 261 |
+
return health.is_healthy(model_ref)
|
| 262 |
+
|
| 263 |
@asynccontextmanager
|
| 264 |
async def concurrency_slot(self) -> AsyncIterator[None]:
|
| 265 |
"""Async context manager that holds one concurrency slot for a stream.
|