Yash030 Claude Opus 4.7 committed on
Commit
a5ea640
·
1 Parent(s): fcc5278

Speed up NIM provider with failure tracking and faster timeouts

Browse files

- Add ModelHealthTracker for per-model failure tracking (30s TTL)
- Faster timeouts: connect=10s, first_chunk=30s, fallback=20s
- Auto model pre-checks health before attempting requests
- Record failures on timeout/rate-limit for smart fallback
- Skip unhealthy models in both router and services

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>

.claude/settings.local.json CHANGED
@@ -10,7 +10,11 @@
10
  "Bash(git push *)",
11
  "Bash(python -c \"import ast; ast.parse\\(open\\('api/services.py'\\).read\\(\\)\\); print\\('Syntax OK'\\)\")",
12
  "mcp__github__list_issues",
13
- "mcp__github__update_issue"
 
 
 
 
14
  ]
15
  },
16
  "enableAllProjectMcpServers": true,
 
10
  "Bash(git push *)",
11
  "Bash(python -c \"import ast; ast.parse\\(open\\('api/services.py'\\).read\\(\\)\\); print\\('Syntax OK'\\)\")",
12
  "mcp__github__list_issues",
13
+ "mcp__github__update_issue",
14
+ "Bash(git commit *)",
15
+ "Bash(dir \"C:\\\\Users\\\\yashw\\\\.gemini\\\\antigravity\\\\llm_wiki_v2\\\\wiki\\\\entities\")",
16
+ "Bash(dir *)",
17
+ "Bash(node -e ' *)"
18
  ]
19
  },
20
  "enableAllProjectMcpServers": true,
api/__pycache__/detection.cpython-314.pyc CHANGED
Binary files a/api/__pycache__/detection.cpython-314.pyc and b/api/__pycache__/detection.cpython-314.pyc differ
 
api/__pycache__/model_router.cpython-314.pyc CHANGED
Binary files a/api/__pycache__/model_router.cpython-314.pyc and b/api/__pycache__/model_router.cpython-314.pyc differ
 
api/__pycache__/optimization_handlers.cpython-314.pyc CHANGED
Binary files a/api/__pycache__/optimization_handlers.cpython-314.pyc and b/api/__pycache__/optimization_handlers.cpython-314.pyc differ
 
api/__pycache__/routes.cpython-314.pyc CHANGED
Binary files a/api/__pycache__/routes.cpython-314.pyc and b/api/__pycache__/routes.cpython-314.pyc differ
 
api/__pycache__/runtime.cpython-314.pyc CHANGED
Binary files a/api/__pycache__/runtime.cpython-314.pyc and b/api/__pycache__/runtime.cpython-314.pyc differ
 
api/__pycache__/services.cpython-314.pyc CHANGED
Binary files a/api/__pycache__/services.cpython-314.pyc and b/api/__pycache__/services.cpython-314.pyc differ
 
api/model_router.py CHANGED
@@ -196,11 +196,17 @@ class ModelRouter:
196
  if provider_id == "zen":
197
  is_blocked = False
198
 
199
- if is_blocked:
 
 
 
 
200
  logger.debug(
201
- "Routing: candidate '{}' (from {}) is BLOCKED",
202
  normalized_ref,
203
  source,
 
 
204
  )
205
  blocked_candidates.append(resolved)
206
  else:
 
196
  if provider_id == "zen":
197
  is_blocked = False
198
 
199
+ # Check model health (recent failures)
200
+ is_healthy = limiter.is_healthy(normalized_ref)
201
+
202
+ if is_blocked or not is_healthy:
203
+ reason = "BLOCKED" if is_blocked else "UNHEALTHY"
204
  logger.debug(
205
+ "Routing: candidate '{}' (from {}) is {} (health={})",
206
  normalized_ref,
207
  source,
208
+ reason,
209
+ is_healthy,
210
  )
211
  blocked_candidates.append(resolved)
212
  else:
api/services.py CHANGED
@@ -244,18 +244,27 @@ class ClaudeProxyService:
244
 
245
  for i, resolved in enumerate(candidates):
246
  try:
247
- # Pre-check: skip candidates that are currently rate limited
248
  from providers.rate_limit import GlobalRateLimiter
249
 
250
  limiter = GlobalRateLimiter.get_scoped_instance(resolved.provider_id)
251
  if limiter.is_blocked() and resolved.provider_id != "zen":
252
  logger.warning(
253
- "Provider '{} is currently rate limited, skipping to next candidate...",
254
  resolved.provider_id,
255
  )
256
  last_exc = Exception("Rate limited")
257
  continue
258
 
 
 
 
 
 
 
 
 
 
259
  provider = self._provider_getter(resolved.provider_id)
260
  routed_request = request_data.model_copy(deep=True)
261
  routed_request.model = resolved.provider_model
@@ -302,6 +311,7 @@ class ClaudeProxyService:
302
  resolved.provider_id,
303
  e.status_code,
304
  )
 
305
  last_exc = e
306
  continue
307
  except TimeoutError as e:
@@ -311,6 +321,7 @@ class ClaudeProxyService:
311
  resolved.provider_id,
312
  type(e).__name__,
313
  )
 
314
  last_exc = e
315
  continue
316
  except Exception as e:
@@ -334,6 +345,7 @@ class ClaudeProxyService:
334
  type(e).__name__,
335
  e,
336
  )
 
337
  last_exc = e
338
  continue
339
 
 
244
 
245
  for i, resolved in enumerate(candidates):
246
  try:
247
+ # Pre-check: skip candidates that are currently rate limited or unhealthy
248
  from providers.rate_limit import GlobalRateLimiter
249
 
250
  limiter = GlobalRateLimiter.get_scoped_instance(resolved.provider_id)
251
  if limiter.is_blocked() and resolved.provider_id != "zen":
252
  logger.warning(
253
+ "Provider '{}' is currently rate limited, skipping to next candidate...",
254
  resolved.provider_id,
255
  )
256
  last_exc = Exception("Rate limited")
257
  continue
258
 
259
+ # Check model health (recent failures)
260
+ if not limiter.is_healthy(resolved.provider_model_ref):
261
+ logger.warning(
262
+ "Provider '{}' has recent failures, skipping to next candidate...",
263
+ resolved.provider_model_ref,
264
+ )
265
+ last_exc = Exception("Recent failures")
266
+ continue
267
+
268
  provider = self._provider_getter(resolved.provider_id)
269
  routed_request = request_data.model_copy(deep=True)
270
  routed_request.model = resolved.provider_model
 
311
  resolved.provider_id,
312
  e.status_code,
313
  )
314
+ limiter.record_failure(resolved.provider_model_ref)
315
  last_exc = e
316
  continue
317
  except TimeoutError as e:
 
321
  resolved.provider_id,
322
  type(e).__name__,
323
  )
324
+ limiter.record_failure(resolved.provider_model_ref)
325
  last_exc = e
326
  continue
327
  except Exception as e:
 
345
  type(e).__name__,
346
  e,
347
  )
348
+ limiter.record_failure(resolved.provider_model_ref)
349
  last_exc = e
350
  continue
351
 
core/__pycache__/session_tracker.cpython-314.pyc CHANGED
Binary files a/core/__pycache__/session_tracker.cpython-314.pyc and b/core/__pycache__/session_tracker.cpython-314.pyc differ
 
providers/__pycache__/openai_compat.cpython-314.pyc CHANGED
Binary files a/providers/__pycache__/openai_compat.cpython-314.pyc and b/providers/__pycache__/openai_compat.cpython-314.pyc differ
 
providers/__pycache__/rate_limit.cpython-314.pyc CHANGED
Binary files a/providers/__pycache__/rate_limit.cpython-314.pyc and b/providers/__pycache__/rate_limit.cpython-314.pyc differ
 
providers/nvidia_nim/__pycache__/client.cpython-314.pyc CHANGED
Binary files a/providers/nvidia_nim/__pycache__/client.cpython-314.pyc and b/providers/nvidia_nim/__pycache__/client.cpython-314.pyc differ
 
providers/nvidia_nim/client.py CHANGED
@@ -108,10 +108,10 @@ class NvidiaNimProvider(OpenAIChatTransport):
108
  """
109
  from config.settings import get_settings
110
 
111
- # Reduced timeouts for faster fallback detection
112
- connect_timeout_s = 15 # Reduced from 30
113
- first_chunk_timeout_s = 45 # Reduced from 60
114
- fallback_first_chunk_timeout_s = 30 # Reduced from 60 - faster fallback
115
 
116
  try:
117
  client = self._client_for_body(body)
 
108
  """
109
  from config.settings import get_settings
110
 
111
+ # Faster timeouts for quick failover detection
112
+ connect_timeout_s = 10 # Reduced from 15
113
+ first_chunk_timeout_s = 30 # Reduced from 45
114
+ fallback_first_chunk_timeout_s = 20 # Reduced from 30
115
 
116
  try:
117
  client = self._client_for_body(body)
providers/rate_limit.py CHANGED
@@ -16,6 +16,60 @@ from core.rate_limit import StrictSlidingWindowLimiter
16
  T = TypeVar("T")
17
 
18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  class GlobalRateLimiter:
20
  """
21
  Global singleton rate limiter that blocks all requests
@@ -181,6 +235,31 @@ class GlobalRateLimiter:
181
  """Get remaining reactive wait time in seconds."""
182
  return max(0.0, self._blocked_until - time.monotonic())
183
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
184
  @asynccontextmanager
185
  async def concurrency_slot(self) -> AsyncIterator[None]:
186
  """Async context manager that holds one concurrency slot for a stream.
 
16
  T = TypeVar("T")
17
 
18
 
19
class ModelHealthTracker:
    """Track per-model health based on recent failures.

    A model is considered unhealthy once it has accumulated
    ``max_failures`` or more failures within the trailing ``failure_ttl``
    seconds. Timestamps come from ``time.monotonic()`` so the window is
    immune to wall-clock adjustments.

    NOTE(review): not thread-safe; assumed to be used from a single
    event loop, like the rest of this module — confirm.
    """

    # Process-wide singleton, lazily created by get_instance().
    _instance: ClassVar["ModelHealthTracker | None"] = None

    def __init__(self, failure_ttl: float = 30.0, max_failures: int = 3) -> None:
        """
        Args:
            failure_ttl: Sliding-window length in seconds.
            max_failures: Failure count at which a model becomes unhealthy.
        """
        self._failure_ttl = failure_ttl
        self._max_failures = max_failures
        # model_ref -> monotonic timestamps of failures inside the TTL window.
        self._failures: dict[str, list[float]] = {}

    @classmethod
    def get_instance(cls) -> "ModelHealthTracker":
        """Return the shared singleton, creating it on first use."""
        if cls._instance is None:
            cls._instance = cls()
        return cls._instance

    def _prune(self, model_ref: str) -> list[float]:
        """Drop timestamps older than the TTL window; return the survivors.

        Also removes empty entries so the dict does not retain a key for
        every model that ever failed.
        """
        cutoff = time.monotonic() - self._failure_ttl
        recent = [t for t in self._failures.get(model_ref, []) if t > cutoff]
        if recent:
            self._failures[model_ref] = recent
        else:
            self._failures.pop(model_ref, None)
        return recent

    def record_failure(self, model_ref: str) -> None:
        """Record a failure timestamp for *model_ref*.

        Prunes expired timestamps on write as well as on read, so a model
        that fails repeatedly cannot grow its history list without bound.
        """
        recent = self._prune(model_ref)
        recent.append(time.monotonic())
        self._failures[model_ref] = recent
        logger.debug("HEALTH: recorded failure for '{}'", model_ref)

    def is_healthy(self, model_ref: str) -> bool:
        """Return True if *model_ref* has fewer than max_failures recent failures."""
        recent = self._prune(model_ref)
        healthy = len(recent) < self._max_failures
        if not healthy:
            logger.debug(
                "HEALTH: model '{}' is unhealthy ({} failures in {}s)",
                model_ref,
                len(recent),
                self._failure_ttl,
            )
        return healthy

    def get_failure_count(self, model_ref: str) -> int:
        """Return the number of failures recorded within the TTL window."""
        return len(self._prune(model_ref))

    def clear_failures(self, model_ref: str) -> None:
        """Forget all failure history for *model_ref* (call on success)."""
        self._failures.pop(model_ref, None)
72
+
73
  class GlobalRateLimiter:
74
  """
75
  Global singleton rate limiter that blocks all requests
 
235
  """Get remaining reactive wait time in seconds."""
236
  return max(0.0, self._blocked_until - time.monotonic())
237
 
238
+ def record_failure(self, model_ref: str | None = None) -> None:
239
+ """Record a failure for rate limit tracking.
240
+
241
+ Args:
242
+ model_ref: Optional model identifier for health tracking.
243
+ """
244
+ # Record in the shared health tracker if model provided
245
+ if model_ref:
246
+ health = ModelHealthTracker.get_instance()
247
+ health.record_failure(model_ref)
248
+
249
+ def is_healthy(self, model_ref: str | None = None) -> bool:
250
+ """Check if provider/model is healthy based on failure history.
251
+
252
+ Args:
253
+ model_ref: Optional model identifier for health tracking.
254
+
255
+ Returns:
256
+ True if no recent failures or model_ref is None.
257
+ """
258
+ if model_ref is None:
259
+ return True
260
+ health = ModelHealthTracker.get_instance()
261
+ return health.is_healthy(model_ref)
262
+
263
  @asynccontextmanager
264
  async def concurrency_slot(self) -> AsyncIterator[None]:
265
  """Async context manager that holds one concurrency slot for a stream.