Mirrowel committed on
Commit
0ca1651
·
1 Parent(s): 98f6823

feat(usage): ✨ implement per-model quota tracking with authoritative reset timestamps

Browse files

This commit introduces granular per-model quota tracking that supports provider-specific reset timestamps from quota-exhausted errors.

Key changes:

- Add a `quota_reset_timestamp` field to `ClassifiedError` to capture the authoritative Unix timestamp from the provider's quota-exhausted responses
- Implement per-model usage tracking mode where each model maintains its own window with `window_start_ts` and `quota_reset_ts`
- Add quota group support for models that share quota limits (e.g., Claude Sonnet and Opus on Antigravity)
- Parse Antigravity's `quotaResetTimeStamp` ISO format to Unix timestamp for precise reset timing
- Update reset logic to prioritize authoritative `quota_reset_ts` over fallback window calculations
- Distinguish between quota-exhausted errors (which set an authoritative reset time) and rate-limit errors (which apply only a transient cooldown)
- Migrate Antigravity provider to per-model tracking with 5-hour windows for paid tier and 7-day windows for free tier

The per-model mode enables more accurate quota tracking by using exact reset times from provider error responses rather than estimated windows, preventing premature resets and improving credential utilization.

BREAKING CHANGE: Provider implementations using custom `get_usage_reset_config()` must now return a `mode` field ("per_model" or "credential") instead of `field_name`. The usage data structure has changed from `key_data["field_name"]["models"]` to `key_data["models"]` for per-model tracking. Existing usage data will be preserved but new tracking will use the updated structure.

src/rotator_library/error_handler.py CHANGED
@@ -347,14 +347,26 @@ class ClassifiedError:
347
  original_exception: Exception,
348
  status_code: Optional[int] = None,
349
  retry_after: Optional[int] = None,
 
350
  ):
351
  self.error_type = error_type
352
  self.original_exception = original_exception
353
  self.status_code = status_code
354
  self.retry_after = retry_after
 
 
 
355
 
356
  def __str__(self):
357
- return f"ClassifiedError(type={self.error_type}, status={self.status_code}, retry_after={self.retry_after}, original_exc={self.original_exception})"
 
 
 
 
 
 
 
 
358
 
359
 
360
  def _extract_retry_from_json_body(json_text: str) -> Optional[int]:
@@ -567,6 +579,7 @@ def classify_error(e: Exception, provider: Optional[str] = None) -> ClassifiedEr
567
  retry_after = quota_info["retry_after"]
568
  reason = quota_info.get("reason", "QUOTA_EXHAUSTED")
569
  reset_ts = quota_info.get("reset_timestamp")
 
570
 
571
  # Log the parsed result with human-readable duration
572
  hours = retry_after / 3600
@@ -581,6 +594,7 @@ def classify_error(e: Exception, provider: Optional[str] = None) -> ClassifiedEr
581
  original_exception=e,
582
  status_code=429,
583
  retry_after=retry_after,
 
584
  )
585
  except Exception as parse_error:
586
  lib_logger.debug(
 
347
  original_exception: Exception,
348
  status_code: Optional[int] = None,
349
  retry_after: Optional[int] = None,
350
+ quota_reset_timestamp: Optional[float] = None,
351
  ):
352
  self.error_type = error_type
353
  self.original_exception = original_exception
354
  self.status_code = status_code
355
  self.retry_after = retry_after
356
+ # Unix timestamp when quota resets (from quota_exhausted errors)
357
+ # This is the authoritative reset time parsed from provider's error response
358
+ self.quota_reset_timestamp = quota_reset_timestamp
359
 
360
  def __str__(self):
361
+ parts = [
362
+ f"type={self.error_type}",
363
+ f"status={self.status_code}",
364
+ f"retry_after={self.retry_after}",
365
+ ]
366
+ if self.quota_reset_timestamp:
367
+ parts.append(f"quota_reset_ts={self.quota_reset_timestamp}")
368
+ parts.append(f"original_exc={self.original_exception}")
369
+ return f"ClassifiedError({', '.join(parts)})"
370
 
371
 
372
  def _extract_retry_from_json_body(json_text: str) -> Optional[int]:
 
579
  retry_after = quota_info["retry_after"]
580
  reason = quota_info.get("reason", "QUOTA_EXHAUSTED")
581
  reset_ts = quota_info.get("reset_timestamp")
582
+ quota_reset_timestamp = quota_info.get("quota_reset_timestamp")
583
 
584
  # Log the parsed result with human-readable duration
585
  hours = retry_after / 3600
 
594
  original_exception=e,
595
  status_code=429,
596
  retry_after=retry_after,
597
+ quota_reset_timestamp=quota_reset_timestamp,
598
  )
599
  except Exception as parse_error:
600
  lib_logger.debug(
src/rotator_library/providers/antigravity_provider.py CHANGED
@@ -600,6 +600,7 @@ class AntigravityProvider(AntigravityAuthBase, ProviderInterface):
600
  "retry_after": None,
601
  "reason": None,
602
  "reset_timestamp": None,
 
603
  }
604
 
605
  for detail in details:
@@ -626,8 +627,22 @@ class AntigravityProvider(AntigravityAuthBase, ProviderInterface):
626
  if parsed:
627
  result["retry_after"] = parsed
628
 
629
- # Capture reset timestamp for logging
630
- result["reset_timestamp"] = metadata.get("quotaResetTimeStamp")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
631
 
632
  # Return None if we couldn't extract retry_after
633
  if not result["retry_after"]:
@@ -826,45 +841,48 @@ class AntigravityProvider(AntigravityAuthBase, ProviderInterface):
826
  """
827
  Get Antigravity-specific usage tracking configuration based on credential tier.
828
 
829
- Antigravity has different quota reset windows by tier:
830
- - Paid tiers (priority 1): 5-hour rolling window
831
- - Free tier (priority 2): 7-day rolling window
832
- - Unknown/legacy: 7-day rolling window (conservative default)
 
 
 
833
 
834
  Args:
835
  credential: The credential path
836
 
837
  Returns:
838
- Usage reset configuration dict
839
  """
840
  tier = self.project_tier_cache.get(credential)
841
  if not tier:
842
  tier = self._load_tier_from_file(credential)
843
 
844
- # Paid tiers: 5-hour window
845
  if tier and tier not in ["free-tier", "legacy-tier", "unknown"]:
846
  return {
847
  "window_seconds": 5 * 60 * 60, # 18000 seconds = 5 hours
848
- "field_name": "5h_window",
849
  "priority": 1,
850
- "description": "5-hour rolling window (paid tier)",
851
  }
852
 
853
- # Free tier: 7-day window
854
  if tier == "free-tier":
855
  return {
856
  "window_seconds": 7 * 24 * 60 * 60, # 604800 seconds = 7 days
857
- "field_name": "weekly",
858
  "priority": 2,
859
- "description": "7-day rolling window (free tier)",
860
  }
861
 
862
- # Unknown/legacy: use 7-day window as conservative default
863
  return {
864
  "window_seconds": 7 * 24 * 60 * 60, # 604800 seconds = 7 days
865
- "field_name": "weekly",
866
  "priority": 10,
867
- "description": "7-day rolling window (unknown tier - conservative default)",
868
  }
869
 
870
  def get_default_usage_field_name(self) -> str:
@@ -872,9 +890,51 @@ class AntigravityProvider(AntigravityAuthBase, ProviderInterface):
872
  Get the default usage tracking field name for Antigravity.
873
 
874
  Returns:
875
- "weekly" as the conservative default for unknown credentials
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
876
  """
877
- return "weekly"
878
 
879
  async def initialize_credentials(self, credential_paths: List[str]) -> None:
880
  """
 
600
  "retry_after": None,
601
  "reason": None,
602
  "reset_timestamp": None,
603
+ "quota_reset_timestamp": None, # Unix timestamp for quota reset
604
  }
605
 
606
  for detail in details:
 
627
  if parsed:
628
  result["retry_after"] = parsed
629
 
630
+ # Capture reset timestamp for logging and authoritative reset time
631
+ reset_ts_str = metadata.get("quotaResetTimeStamp")
632
+ result["reset_timestamp"] = reset_ts_str
633
+
634
+ # Parse ISO timestamp to Unix timestamp for usage tracking
635
+ if reset_ts_str:
636
+ try:
637
+ # Handle ISO format: "2025-12-11T22:53:16Z"
638
+ reset_dt = datetime.fromisoformat(
639
+ reset_ts_str.replace("Z", "+00:00")
640
+ )
641
+ result["quota_reset_timestamp"] = reset_dt.timestamp()
642
+ except (ValueError, AttributeError) as e:
643
+ lib_logger.warning(
644
+ f"Failed to parse quota reset timestamp '{reset_ts_str}': {e}"
645
+ )
646
 
647
  # Return None if we couldn't extract retry_after
648
  if not result["retry_after"]:
 
841
  """
842
  Get Antigravity-specific usage tracking configuration based on credential tier.
843
 
844
+ Antigravity uses per-model windows with different durations by tier:
845
+ - Paid tiers (priority 1): 5-hour per-model window
846
+ - Free tier (priority 2): 7-day per-model window
847
+ - Unknown/legacy: 7-day per-model window (conservative default)
848
+
849
+ When a model hits a quota_exhausted 429 error with exact reset timestamp,
850
+ that timestamp becomes the authoritative reset time for the model (and its group).
851
 
852
  Args:
853
  credential: The credential path
854
 
855
  Returns:
856
+ Usage reset configuration dict with mode="per_model"
857
  """
858
  tier = self.project_tier_cache.get(credential)
859
  if not tier:
860
  tier = self._load_tier_from_file(credential)
861
 
862
+ # Paid tiers: 5-hour per-model window
863
  if tier and tier not in ["free-tier", "legacy-tier", "unknown"]:
864
  return {
865
  "window_seconds": 5 * 60 * 60, # 18000 seconds = 5 hours
866
+ "mode": "per_model",
867
  "priority": 1,
868
+ "description": "5-hour per-model window (paid tier)",
869
  }
870
 
871
+ # Free tier: 7-day per-model window
872
  if tier == "free-tier":
873
  return {
874
  "window_seconds": 7 * 24 * 60 * 60, # 604800 seconds = 7 days
875
+ "mode": "per_model",
876
  "priority": 2,
877
+ "description": "7-day per-model window (free tier)",
878
  }
879
 
880
+ # Unknown/legacy: use 7-day per-model window as conservative default
881
  return {
882
  "window_seconds": 7 * 24 * 60 * 60, # 604800 seconds = 7 days
883
+ "mode": "per_model",
884
  "priority": 10,
885
+ "description": "7-day per-model window (unknown tier - conservative default)",
886
  }
887
 
888
  def get_default_usage_field_name(self) -> str:
 
890
  Get the default usage tracking field name for Antigravity.
891
 
892
  Returns:
893
+ "models" for per-model tracking
894
+ """
895
+ return "models"
896
+
897
+ # =========================================================================
898
+ # Model Quota Grouping
899
+ # =========================================================================
900
+
901
+ # Models that share quota timing - when one hits quota, all get same reset time
902
+ QUOTA_GROUPS = {
903
+ # Future: add claude/gemini groups if they share quota
904
+ }
905
+
906
+ def get_model_quota_group(self, model: str) -> Optional[str]:
907
+ """
908
+ Returns the quota group name for a model.
909
+
910
+ Claude models (sonnet and opus) share quota on Antigravity.
911
+ When one hits quota exhausted, all models in the group get the same reset time.
912
+
913
+ Args:
914
+ model: Model name (with or without "antigravity/" prefix)
915
+
916
+ Returns:
917
+ Group name ("claude") or None if not grouped
918
+ """
919
+ # Remove provider prefix if present
920
+ clean_model = model.replace("antigravity/", "")
921
+
922
+ for group_name, models in self.QUOTA_GROUPS.items():
923
+ if clean_model in models:
924
+ return group_name
925
+ return None
926
+
927
+ def get_models_in_quota_group(self, group: str) -> List[str]:
928
+ """
929
+ Returns all model names in a quota group.
930
+
931
+ Args:
932
+ group: Group name (e.g., "claude")
933
+
934
+ Returns:
935
+ List of model names (without provider prefix)
936
  """
937
+ return self.QUOTA_GROUPS.get(group, [])
938
 
939
  async def initialize_credentials(self, credential_paths: List[str]) -> None:
940
  """
src/rotator_library/providers/provider_interface.py CHANGED
@@ -202,6 +202,7 @@ class ProviderInterface(ABC):
202
  "retry_after": int, # seconds until quota resets
203
  "reason": str, # e.g., "QUOTA_EXHAUSTED", "RATE_LIMITED"
204
  "reset_timestamp": str | None, # ISO timestamp if available
 
205
  }
206
  """
207
  return None # Default: no provider-specific parsing
@@ -218,9 +219,9 @@ class ProviderInterface(ABC):
218
  credential tier (e.g., paid vs free accounts with different quota periods).
219
 
220
  The UsageManager will use this configuration to:
221
- 1. Track usage in a custom-named field (instead of default "daily")
222
- 2. Reset usage based on a rolling window from first request
223
- 3. Archive stats to "global" when the window expires
224
 
225
  Args:
226
  credential: The credential identifier (API key or path)
@@ -229,32 +230,35 @@ class ProviderInterface(ABC):
229
  None to use default daily reset, otherwise a dict with:
230
  {
231
  "window_seconds": int, # Duration in seconds (e.g., 18000 for 5h)
232
- "field_name": str, # Custom field name (e.g., "5h_window", "weekly")
233
- "priority": int, # Priority level this config applies to (for docs)
234
  "description": str, # Human-readable description (for logging)
235
  }
236
 
 
 
 
 
 
 
 
 
237
  Examples:
238
- Antigravity paid tier:
239
  {
240
  "window_seconds": 18000, # 5 hours
241
- "field_name": "5h_window",
242
  "priority": 1,
243
- "description": "5-hour rolling window (paid tier)"
244
  }
245
 
246
- Antigravity free tier:
247
  {
248
- "window_seconds": 604800, # 7 days
249
- "field_name": "weekly",
250
- "priority": 2,
251
- "description": "7-day rolling window (free tier)"
252
  }
253
-
254
- Note:
255
- - window_seconds: Time from first request until stats reset
256
- - When window expires, stats move to "global" (same as daily reset)
257
- - First request after window expiry starts a new window
258
  """
259
  return None # Default: use daily reset at daily_reset_time_utc
260
 
@@ -269,3 +273,39 @@ class ProviderInterface(ABC):
269
  Field name string (default: "daily")
270
  """
271
  return "daily"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
202
  "retry_after": int, # seconds until quota resets
203
  "reason": str, # e.g., "QUOTA_EXHAUSTED", "RATE_LIMITED"
204
  "reset_timestamp": str | None, # ISO timestamp if available
205
+ "quota_reset_timestamp": float | None, # Unix timestamp for quota reset
206
  }
207
  """
208
  return None # Default: no provider-specific parsing
 
219
  credential tier (e.g., paid vs free accounts with different quota periods).
220
 
221
  The UsageManager will use this configuration to:
222
+ 1. Track usage per-model or per-credential based on mode
223
+ 2. Reset usage based on a rolling window OR quota exhausted timestamp
224
+ 3. Archive stats to "global" when the window/quota expires
225
 
226
  Args:
227
  credential: The credential identifier (API key or path)
 
230
  None to use default daily reset, otherwise a dict with:
231
  {
232
  "window_seconds": int, # Duration in seconds (e.g., 18000 for 5h)
233
+ "mode": str, # "credential" or "per_model"
234
+ "priority": int, # Priority level this config applies to
235
  "description": str, # Human-readable description (for logging)
236
  }
237
 
238
+ Modes:
239
+ - "credential": One window per credential. Window starts from first
240
+ request of ANY model. All models reset together when window expires.
241
+ - "per_model": Separate window per model (or model group). Window starts
242
+ from first request of THAT model. Models reset independently unless
243
+ grouped. If a quota_exhausted error provides exact reset time, that
244
+ becomes the authoritative reset time for the model.
245
+
246
  Examples:
247
+ Antigravity paid tier (per-model):
248
  {
249
  "window_seconds": 18000, # 5 hours
250
+ "mode": "per_model",
251
  "priority": 1,
252
+ "description": "5-hour per-model window (paid tier)"
253
  }
254
 
255
+ Default provider (credential-level):
256
  {
257
+ "window_seconds": 86400, # 24 hours
258
+ "mode": "credential",
259
+ "priority": 1,
260
+ "description": "24-hour credential window"
261
  }
 
 
 
 
 
262
  """
263
  return None # Default: use daily reset at daily_reset_time_utc
264
 
 
273
  Field name string (default: "daily")
274
  """
275
  return "daily"
276
+
277
+ # =========================================================================
278
+ # Model Quota Grouping
279
+ # =========================================================================
280
+
281
+ def get_model_quota_group(self, model: str) -> Optional[str]:
282
+ """
283
+ Returns the quota group name for a model, or None if not grouped.
284
+
285
+ Models in the same quota group share cooldown timing - when one model
286
+ hits a quota exhausted error, all models in the group get the same
287
+ reset timestamp. They also reset (archive stats) together.
288
+
289
+ This is useful for providers where multiple model variants share the
290
+ same underlying quota (e.g., Claude Sonnet and Opus on Antigravity).
291
+
292
+ Args:
293
+ model: Model name (with or without provider prefix)
294
+
295
+ Returns:
296
+ Group name string (e.g., "claude") or None if model is not grouped
297
+ """
298
+ return None
299
+
300
+ def get_models_in_quota_group(self, group: str) -> List[str]:
301
+ """
302
+ Returns all model names that belong to a quota group.
303
+
304
+ Args:
305
+ group: Group name (e.g., "claude")
306
+
307
+ Returns:
308
+ List of model names (WITHOUT provider prefix) in the group.
309
+ Empty list if group doesn't exist.
310
+ """
311
+ return []
src/rotator_library/usage_manager.py CHANGED
@@ -162,6 +162,69 @@ class UsageManager:
162
 
163
  return None
164
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
165
  def _get_usage_field_name(self, credential: str) -> str:
166
  """
167
  Get the usage tracking field name for a credential.
@@ -190,27 +253,36 @@ class UsageManager:
190
 
191
  def _get_usage_count(self, key: str, model: str) -> int:
192
  """
193
- Get the current usage count for a model from the appropriate usage field.
 
 
 
 
194
 
195
  Args:
196
  key: Credential identifier
197
  model: Model name
198
 
199
  Returns:
200
- Usage count (success_count) for the model in the current window/daily period
201
  """
202
  if self._usage_data is None:
203
  return 0
204
 
205
  key_data = self._usage_data.get(key, {})
206
- usage_field = self._get_usage_field_name(key)
207
 
208
- return (
209
- key_data.get(usage_field, {})
210
- .get("models", {})
211
- .get(model, {})
212
- .get("success_count", 0)
213
- )
 
 
 
 
 
214
 
215
  def _select_sequential(
216
  self,
@@ -299,9 +371,10 @@ class UsageManager:
299
  """
300
  Checks if usage stats need to be reset for any key.
301
 
302
- Supports two reset modes:
303
- 1. Provider-specific rolling windows (e.g., 5h for Antigravity paid, 7d for free)
304
- 2. Legacy daily reset at daily_reset_time_utc for providers without custom config
 
305
  """
306
  if self._usage_data is None:
307
  return
@@ -312,16 +385,23 @@ class UsageManager:
312
  needs_saving = False
313
 
314
  for key, data in self._usage_data.items():
315
- # Check for provider-specific reset configuration
316
  reset_config = self._get_usage_reset_config(key)
317
 
318
  if reset_config:
319
- # Provider-specific rolling window reset
320
- needs_saving |= await self._check_window_reset(
321
- key, data, reset_config, now_ts
322
- )
 
 
 
 
 
 
 
 
323
  elif self.daily_reset_time_utc:
324
- # Legacy daily reset for providers without custom config
325
  needs_saving |= await self._check_daily_reset(
326
  key, data, now_utc, today_str, now_ts
327
  )
@@ -329,6 +409,170 @@ class UsageManager:
329
  if needs_saving:
330
  await self._save_usage()
331
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
332
  async def _check_window_reset(
333
  self,
334
  key: str,
@@ -948,36 +1192,67 @@ class UsageManager:
948
  Records a successful API call, resetting failure counters.
949
  It safely handles cases where token usage data is not available.
950
 
951
- Uses provider-specific field names for usage tracking (e.g., "5h_window", "weekly")
952
- and sets window start timestamp on first request.
 
953
  """
954
  await self._lazy_init()
955
  async with self._data_lock:
956
  now_ts = time.time()
957
  today_utc_str = datetime.now(timezone.utc).date().isoformat()
958
 
959
- # Determine the usage field name for this credential
960
- usage_field = self._get_usage_field_name(key)
961
  reset_config = self._get_usage_reset_config(key)
962
- uses_window = reset_config is not None
 
 
963
 
964
- # Initialize key data with appropriate structure
965
- if uses_window:
966
- # Provider-specific rolling window
967
  key_data = self._usage_data.setdefault(
968
  key,
969
  {
970
- usage_field: {"start_ts": None, "models": {}},
971
  "global": {"models": {}},
972
  "model_cooldowns": {},
973
  "failures": {},
974
  },
975
  )
976
- # Ensure the usage field exists (for migration from old format)
977
- if usage_field not in key_data:
978
- key_data[usage_field] = {"start_ts": None, "models": {}}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
979
  else:
980
- # Legacy daily reset
981
  key_data = self._usage_data.setdefault(
982
  key,
983
  {
@@ -987,57 +1262,41 @@ class UsageManager:
987
  "failures": {},
988
  },
989
  )
990
- usage_field = "daily"
991
 
992
- # If the key is new, ensure its reset date is initialized to prevent an immediate reset.
993
- if not uses_window and "last_daily_reset" not in key_data:
994
- key_data["last_daily_reset"] = today_utc_str
995
 
996
- # Always record a success and reset failures
 
 
 
 
 
 
 
 
 
 
 
 
997
  model_failures = key_data.setdefault("failures", {}).setdefault(model, {})
998
  model_failures["consecutive_failures"] = 0
 
 
999
  if model in key_data.get("model_cooldowns", {}):
1000
  del key_data["model_cooldowns"][model]
1001
 
1002
- # Get or create the usage field data
1003
- usage_data = key_data.setdefault(usage_field, {"models": {}})
1004
-
1005
- # For window-based tracking, set start_ts on first request
1006
- if uses_window:
1007
- if usage_data.get("start_ts") is None:
1008
- usage_data["start_ts"] = now_ts
1009
- window_hours = reset_config.get("window_seconds", 0) / 3600
1010
- description = reset_config.get("description", "rolling window")
1011
- lib_logger.info(
1012
- f"Starting new {window_hours:.1f}h window for {mask_credential(key)} - {description}"
1013
- )
1014
-
1015
- # Ensure models dict exists
1016
- if "models" not in usage_data:
1017
- usage_data["models"] = {}
1018
-
1019
- model_data = usage_data["models"].setdefault(
1020
- model,
1021
- {
1022
- "success_count": 0,
1023
- "prompt_tokens": 0,
1024
- "completion_tokens": 0,
1025
- "approx_cost": 0.0,
1026
- },
1027
- )
1028
- model_data["success_count"] += 1
1029
-
1030
- # Safely attempt to record token and cost usage
1031
  if (
1032
  completion_response
1033
  and hasattr(completion_response, "usage")
1034
  and completion_response.usage
1035
  ):
1036
  usage = completion_response.usage
1037
- model_data["prompt_tokens"] += usage.prompt_tokens
1038
- model_data["completion_tokens"] += getattr(
1039
  usage, "completion_tokens", 0
1040
- ) # Not present in embedding responses
1041
  lib_logger.info(
1042
  f"Recorded usage from response object for key {mask_credential(key)}"
1043
  )
@@ -1045,7 +1304,6 @@ class UsageManager:
1045
  provider_name = model.split("/")[0]
1046
  provider_plugin = self.provider_plugins.get(provider_name)
1047
 
1048
- # Check class attribute directly - no need to instantiate
1049
  if provider_plugin and getattr(
1050
  provider_plugin, "skip_cost_calculation", False
1051
  ):
@@ -1053,9 +1311,7 @@ class UsageManager:
1053
  f"Skipping cost calculation for provider '{provider_name}' (custom provider)."
1054
  )
1055
  else:
1056
- # Differentiate cost calculation based on response type
1057
  if isinstance(completion_response, litellm.EmbeddingResponse):
1058
- # Manually calculate cost for embeddings
1059
  model_info = litellm.get_model_info(model)
1060
  input_cost = model_info.get("input_cost_per_token")
1061
  if input_cost:
@@ -1070,7 +1326,7 @@ class UsageManager:
1070
  )
1071
 
1072
  if cost is not None:
1073
- model_data["approx_cost"] += cost
1074
  except Exception as e:
1075
  lib_logger.warning(
1076
  f"Could not calculate cost for model {model}: {e}"
@@ -1078,8 +1334,7 @@ class UsageManager:
1078
  elif isinstance(completion_response, asyncio.Future) or hasattr(
1079
  completion_response, "__aiter__"
1080
  ):
1081
- # This is an unconsumed stream object. Do not log a warning, as usage will be recorded from the chunks.
1082
- pass
1083
  else:
1084
  lib_logger.warning(
1085
  f"No usage data found in completion response for model {model}. Recording success without token count."
@@ -1096,7 +1351,13 @@ class UsageManager:
1096
  classified_error: ClassifiedError,
1097
  increment_consecutive_failures: bool = True,
1098
  ):
1099
- """Records a failure and applies cooldowns based on an escalating backoff strategy.
 
 
 
 
 
 
1100
 
1101
  Args:
1102
  key: The API key or credential identifier
@@ -1107,19 +1368,20 @@ class UsageManager:
1107
  """
1108
  await self._lazy_init()
1109
  async with self._data_lock:
 
1110
  today_utc_str = datetime.now(timezone.utc).date().isoformat()
1111
 
1112
- # Determine the usage field name for this credential
1113
- usage_field = self._get_usage_field_name(key)
1114
  reset_config = self._get_usage_reset_config(key)
1115
- uses_window = reset_config is not None
 
 
1116
 
1117
  # Initialize key data with appropriate structure
1118
- if uses_window:
1119
  key_data = self._usage_data.setdefault(
1120
  key,
1121
  {
1122
- usage_field: {"start_ts": None, "models": {}},
1123
  "global": {"models": {}},
1124
  "model_cooldowns": {},
1125
  "failures": {},
@@ -1147,36 +1409,94 @@ class UsageManager:
1147
 
1148
  # Calculate cooldown duration based on error type
1149
  cooldown_seconds = None
 
1150
 
1151
- if classified_error.error_type in ["rate_limit", "quota_exceeded"]:
1152
- # Rate limit / Quota errors: use retry_after if available, otherwise default to 60s
 
1153
  cooldown_seconds = classified_error.retry_after or 60
1154
- if classified_error.retry_after:
1155
- # Log with human-readable duration for provider-parsed cooldowns
1156
- hours = cooldown_seconds / 3600
1157
- if hours >= 1:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1158
  lib_logger.info(
1159
- f"Quota/rate limit on key {mask_credential(key)} for model {model}. "
1160
- f"Applying provider-specified cooldown: {cooldown_seconds}s ({hours:.1f}h)"
1161
  )
1162
  else:
 
 
 
 
1163
  lib_logger.info(
1164
- f"Rate limit on key {mask_credential(key)} for model {model}. "
1165
- f"Applying provider-specified cooldown: {cooldown_seconds}s"
1166
  )
 
 
 
1167
  else:
 
 
 
1168
  lib_logger.info(
1169
- f"Rate limit on key {mask_credential(key)} for model {model}. "
1170
- f"Using default cooldown: {cooldown_seconds}s"
1171
  )
 
 
 
 
 
 
 
 
 
 
1172
  elif classified_error.error_type == "authentication":
1173
  # Apply a 5-minute key-level lockout for auth errors
1174
- key_data["key_cooldown_until"] = time.time() + 300
 
 
1175
  lib_logger.warning(
1176
  f"Authentication error on key {mask_credential(key)}. Applying 5-minute key-level lockout."
1177
  )
1178
- # Auth errors still use escalating backoff for the specific model
1179
- cooldown_seconds = 300 # 5 minutes for model cooldown
1180
 
1181
  # If we should increment failures, calculate escalating backoff
1182
  if should_increment:
@@ -1190,35 +1510,27 @@ class UsageManager:
1190
  # If cooldown wasn't set by specific error type, use escalating backoff
1191
  if cooldown_seconds is None:
1192
  backoff_tiers = {1: 10, 2: 30, 3: 60, 4: 120}
1193
- cooldown_seconds = backoff_tiers.get(
1194
- count, 7200
1195
- ) # Default to 2 hours for "spent" keys
1196
  lib_logger.warning(
1197
  f"Failure #{count} for key {mask_credential(key)} with model {model}. "
1198
- f"Error type: {classified_error.error_type}"
1199
  )
1200
  else:
1201
  # Provider-level errors: apply short cooldown but don't count against key
1202
  if cooldown_seconds is None:
1203
- cooldown_seconds = 30 # 30s cooldown for provider issues
 
1204
  lib_logger.info(
1205
- f"Provider-level error ({classified_error.error_type}) for key {mask_credential(key)} with model {model}. "
1206
- f"NOT incrementing consecutive failures. Applying {cooldown_seconds}s cooldown."
1207
  )
1208
 
1209
- # Apply the cooldown
1210
- model_cooldowns = key_data.setdefault("model_cooldowns", {})
1211
- model_cooldowns[model] = time.time() + cooldown_seconds
1212
- lib_logger.warning(
1213
- f"Cooldown applied for key {mask_credential(key)} with model {model}: {cooldown_seconds}s. "
1214
- f"Error type: {classified_error.error_type}"
1215
- )
1216
-
1217
  # Check for key-level lockout condition
1218
  await self._check_key_lockout(key, key_data)
1219
 
1220
  key_data["last_failure"] = {
1221
- "timestamp": time.time(),
1222
  "model": model,
1223
  "error": str(classified_error.original_exception),
1224
  }
 
162
 
163
  return None
164
 
165
+ def _get_reset_mode(self, credential: str) -> str:
166
+ """
167
+ Get the reset mode for a credential: 'credential' or 'per_model'.
168
+
169
+ Args:
170
+ credential: The credential identifier
171
+
172
+ Returns:
173
+ "per_model" or "credential" (default)
174
+ """
175
+ config = self._get_usage_reset_config(credential)
176
+ return config.get("mode", "credential") if config else "credential"
177
+
178
+ def _get_model_quota_group(self, credential: str, model: str) -> Optional[str]:
179
+ """
180
+ Get the quota group for a model, if the provider defines one.
181
+
182
+ Args:
183
+ credential: The credential identifier
184
+ model: Model name (with or without provider prefix)
185
+
186
+ Returns:
187
+ Group name (e.g., "claude") or None if not grouped
188
+ """
189
+ provider = self._get_provider_from_credential(credential)
190
+ if not provider:
191
+ return None
192
+
193
+ plugin = self.provider_plugins.get(provider)
194
+ if not plugin:
195
+ return None
196
+
197
+ if hasattr(plugin, "get_model_quota_group"):
198
+ return plugin.get_model_quota_group(model)
199
+
200
+ return None
201
+
202
+ def _get_grouped_models(self, credential: str, group: str) -> List[str]:
203
+ """
204
+ Get all model names in a quota group (with provider prefix).
205
+
206
+ Args:
207
+ credential: The credential identifier
208
+ group: Group name (e.g., "claude")
209
+
210
+ Returns:
211
+ List of full model names (e.g., ["antigravity/claude-opus-4-5", ...])
212
+ """
213
+ provider = self._get_provider_from_credential(credential)
214
+ if not provider:
215
+ return []
216
+
217
+ plugin = self.provider_plugins.get(provider)
218
+ if not plugin:
219
+ return []
220
+
221
+ if hasattr(plugin, "get_models_in_quota_group"):
222
+ models = plugin.get_models_in_quota_group(group)
223
+ # Add provider prefix
224
+ return [f"{provider}/{m}" for m in models]
225
+
226
+ return []
227
+
228
  def _get_usage_field_name(self, credential: str) -> str:
229
  """
230
  Get the usage tracking field name for a credential.
 
253
 
254
  def _get_usage_count(self, key: str, model: str) -> int:
255
  """
256
+ Get the current usage count for a model from the appropriate usage structure.
257
+
258
+ Supports both:
259
+ - New per-model structure: {"models": {"model_name": {"success_count": N, ...}}}
260
+ - Legacy structure: {"daily": {"models": {"model_name": {"success_count": N, ...}}}}
261
 
262
  Args:
263
  key: Credential identifier
264
  model: Model name
265
 
266
  Returns:
267
+ Usage count (success_count) for the model in the current window/period
268
  """
269
  if self._usage_data is None:
270
  return 0
271
 
272
  key_data = self._usage_data.get(key, {})
273
+ reset_mode = self._get_reset_mode(key)
274
 
275
+ if reset_mode == "per_model":
276
+ # New per-model structure: key_data["models"][model]["success_count"]
277
+ return key_data.get("models", {}).get(model, {}).get("success_count", 0)
278
+ else:
279
+ # Legacy structure: key_data["daily"]["models"][model]["success_count"]
280
+ return (
281
+ key_data.get("daily", {})
282
+ .get("models", {})
283
+ .get(model, {})
284
+ .get("success_count", 0)
285
+ )
286
 
287
  def _select_sequential(
288
  self,
 
371
  """
372
  Checks if usage stats need to be reset for any key.
373
 
374
+ Supports three reset modes:
375
+ 1. per_model: Each model has its own window, resets based on quota_reset_ts or fallback window
376
+ 2. credential: One window per credential (legacy with custom window duration)
377
+ 3. daily: Legacy daily reset at daily_reset_time_utc
378
  """
379
  if self._usage_data is None:
380
  return
 
385
  needs_saving = False
386
 
387
  for key, data in self._usage_data.items():
 
388
  reset_config = self._get_usage_reset_config(key)
389
 
390
  if reset_config:
391
+ reset_mode = reset_config.get("mode", "credential")
392
+
393
+ if reset_mode == "per_model":
394
+ # Per-model window reset
395
+ needs_saving |= await self._check_per_model_resets(
396
+ key, data, reset_config, now_ts
397
+ )
398
+ else:
399
+ # Credential-level window reset (legacy)
400
+ needs_saving |= await self._check_window_reset(
401
+ key, data, reset_config, now_ts
402
+ )
403
  elif self.daily_reset_time_utc:
404
+ # Legacy daily reset
405
  needs_saving |= await self._check_daily_reset(
406
  key, data, now_utc, today_str, now_ts
407
  )
 
409
  if needs_saving:
410
  await self._save_usage()
411
 
412
    async def _check_per_model_resets(
        self,
        key: str,
        data: Dict[str, Any],
        reset_config: Dict[str, Any],
        now_ts: float,
    ) -> bool:
        """
        Check and perform per-model resets for a credential.

        Each model resets independently based on:
        1. quota_reset_ts (authoritative, from quota exhausted error) if set
        2. window_start_ts + window_seconds (fallback) otherwise

        Grouped models reset together - all models in a group must be ready.

        Args:
            key: Credential identifier
            data: Usage data for this credential (mutated in place)
            reset_config: Provider's reset configuration
            now_ts: Current timestamp

        Returns:
            True if data was modified and needs saving
        """
        # Fallback window length used when a model has no authoritative
        # quota_reset_ts; defaults to 24h when the provider omits it.
        window_seconds = reset_config.get("window_seconds", 86400)
        models_data = data.get("models", {})

        if not models_data:
            return False

        modified = False
        processed_groups = set()

        # Iterate a snapshot: resets mutate the per-model dicts in place.
        for model, model_data in list(models_data.items()):
            # Check if this model is in a quota group
            group = self._get_model_quota_group(key, model)

            if group:
                if group in processed_groups:
                    continue  # Already handled this group

                # Check if entire group should reset (all members ready)
                if self._should_group_reset(
                    key, group, models_data, window_seconds, now_ts
                ):
                    # Archive and reset all models in group, not just the
                    # one currently being iterated.
                    grouped_models = self._get_grouped_models(key, group)
                    archived_count = 0

                    for grouped_model in grouped_models:
                        if grouped_model in models_data:
                            gm_data = models_data[grouped_model]
                            self._archive_model_to_global(data, grouped_model, gm_data)
                            self._reset_model_data(gm_data)
                            archived_count += 1

                    if archived_count > 0:
                        lib_logger.info(
                            f"Reset model group '{group}' ({archived_count} models) for {mask_credential(key)}"
                        )
                        modified = True

                # Mark the group processed even when nothing reset, so later
                # members of the same group are not re-evaluated this pass.
                processed_groups.add(group)

            else:
                # Ungrouped model - check individually
                if self._should_model_reset(model_data, window_seconds, now_ts):
                    self._archive_model_to_global(data, model, model_data)
                    self._reset_model_data(model_data)
                    lib_logger.info(f"Reset model {model} for {mask_credential(key)}")
                    modified = True

        # Preserve unexpired cooldowns
        if modified:
            self._preserve_unexpired_cooldowns(key, data, now_ts)
            # NOTE(review): this wipes failure counters for ALL models on the
            # credential, not only the ones that just reset - confirm intended.
            if "failures" in data:
                data["failures"] = {}

        return modified
+
493
+ def _should_model_reset(
494
+ self, model_data: Dict[str, Any], window_seconds: int, now_ts: float
495
+ ) -> bool:
496
+ """
497
+ Check if a single model should reset.
498
+
499
+ Returns True if:
500
+ - quota_reset_ts is set AND now >= quota_reset_ts, OR
501
+ - quota_reset_ts is NOT set AND now >= window_start_ts + window_seconds
502
+ """
503
+ quota_reset = model_data.get("quota_reset_ts")
504
+ window_start = model_data.get("window_start_ts")
505
+
506
+ if quota_reset:
507
+ return now_ts >= quota_reset
508
+ elif window_start:
509
+ return now_ts >= window_start + window_seconds
510
+ return False
511
+
512
+ def _should_group_reset(
513
+ self,
514
+ key: str,
515
+ group: str,
516
+ models_data: Dict[str, Dict],
517
+ window_seconds: int,
518
+ now_ts: float,
519
+ ) -> bool:
520
+ """
521
+ Check if all models in a group should reset.
522
+
523
+ All models in the group must be ready to reset.
524
+ If any model has an active cooldown/window, the whole group waits.
525
+ """
526
+ grouped_models = self._get_grouped_models(key, group)
527
+
528
+ # Track if any model in group has data
529
+ any_has_data = False
530
+
531
+ for grouped_model in grouped_models:
532
+ model_data = models_data.get(grouped_model, {})
533
+
534
+ if not model_data or (
535
+ model_data.get("window_start_ts") is None
536
+ and model_data.get("success_count", 0) == 0
537
+ ):
538
+ continue # No stats for this model yet
539
+
540
+ any_has_data = True
541
+
542
+ if not self._should_model_reset(model_data, window_seconds, now_ts):
543
+ return False # At least one model not ready
544
+
545
+ return any_has_data
546
+
547
+ def _archive_model_to_global(
548
+ self, data: Dict[str, Any], model: str, model_data: Dict[str, Any]
549
+ ) -> None:
550
+ """Archive a single model's stats to global."""
551
+ global_data = data.setdefault("global", {"models": {}})
552
+ global_model = global_data["models"].setdefault(
553
+ model,
554
+ {
555
+ "success_count": 0,
556
+ "prompt_tokens": 0,
557
+ "completion_tokens": 0,
558
+ "approx_cost": 0.0,
559
+ },
560
+ )
561
+
562
+ global_model["success_count"] += model_data.get("success_count", 0)
563
+ global_model["prompt_tokens"] += model_data.get("prompt_tokens", 0)
564
+ global_model["completion_tokens"] += model_data.get("completion_tokens", 0)
565
+ global_model["approx_cost"] += model_data.get("approx_cost", 0.0)
566
+
567
+ def _reset_model_data(self, model_data: Dict[str, Any]) -> None:
568
+ """Reset a model's window and stats."""
569
+ model_data["window_start_ts"] = None
570
+ model_data["quota_reset_ts"] = None
571
+ model_data["success_count"] = 0
572
+ model_data["prompt_tokens"] = 0
573
+ model_data["completion_tokens"] = 0
574
+ model_data["approx_cost"] = 0.0
575
+
576
  async def _check_window_reset(
577
  self,
578
  key: str,
 
1192
  Records a successful API call, resetting failure counters.
1193
  It safely handles cases where token usage data is not available.
1194
 
1195
+ Supports two modes based on provider configuration:
1196
+ - per_model: Each model has its own window_start_ts and stats in key_data["models"]
1197
+ - credential: Legacy mode with key_data["daily"]["models"]
1198
  """
1199
  await self._lazy_init()
1200
  async with self._data_lock:
1201
  now_ts = time.time()
1202
  today_utc_str = datetime.now(timezone.utc).date().isoformat()
1203
 
 
 
1204
  reset_config = self._get_usage_reset_config(key)
1205
+ reset_mode = (
1206
+ reset_config.get("mode", "credential") if reset_config else "credential"
1207
+ )
1208
 
1209
+ if reset_mode == "per_model":
1210
+ # New per-model structure
 
1211
  key_data = self._usage_data.setdefault(
1212
  key,
1213
  {
1214
+ "models": {},
1215
  "global": {"models": {}},
1216
  "model_cooldowns": {},
1217
  "failures": {},
1218
  },
1219
  )
1220
+
1221
+ # Ensure models dict exists
1222
+ if "models" not in key_data:
1223
+ key_data["models"] = {}
1224
+
1225
+ # Get or create per-model data with window tracking
1226
+ model_data = key_data["models"].setdefault(
1227
+ model,
1228
+ {
1229
+ "window_start_ts": None,
1230
+ "quota_reset_ts": None,
1231
+ "success_count": 0,
1232
+ "prompt_tokens": 0,
1233
+ "completion_tokens": 0,
1234
+ "approx_cost": 0.0,
1235
+ },
1236
+ )
1237
+
1238
+ # Start window on first request for this model
1239
+ if model_data.get("window_start_ts") is None:
1240
+ model_data["window_start_ts"] = now_ts
1241
+ window_hours = (
1242
+ reset_config.get("window_seconds", 0) / 3600
1243
+ if reset_config
1244
+ else 0
1245
+ )
1246
+ lib_logger.info(
1247
+ f"Started {window_hours:.1f}h window for model {model} on {mask_credential(key)}"
1248
+ )
1249
+
1250
+ # Record stats
1251
+ model_data["success_count"] += 1
1252
+ usage_data_ref = model_data # For token/cost recording below
1253
+
1254
  else:
1255
+ # Legacy credential-level structure
1256
  key_data = self._usage_data.setdefault(
1257
  key,
1258
  {
 
1262
  "failures": {},
1263
  },
1264
  )
 
1265
 
1266
+ if "last_daily_reset" not in key_data:
1267
+ key_data["last_daily_reset"] = today_utc_str
 
1268
 
1269
+ # Get or create model data in daily structure
1270
+ usage_data_ref = key_data["daily"]["models"].setdefault(
1271
+ model,
1272
+ {
1273
+ "success_count": 0,
1274
+ "prompt_tokens": 0,
1275
+ "completion_tokens": 0,
1276
+ "approx_cost": 0.0,
1277
+ },
1278
+ )
1279
+ usage_data_ref["success_count"] += 1
1280
+
1281
+ # Reset failures for this model
1282
  model_failures = key_data.setdefault("failures", {}).setdefault(model, {})
1283
  model_failures["consecutive_failures"] = 0
1284
+
1285
+ # Clear transient cooldown on success (but NOT quota_reset_ts)
1286
  if model in key_data.get("model_cooldowns", {}):
1287
  del key_data["model_cooldowns"][model]
1288
 
1289
+ # Record token and cost usage
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1290
  if (
1291
  completion_response
1292
  and hasattr(completion_response, "usage")
1293
  and completion_response.usage
1294
  ):
1295
  usage = completion_response.usage
1296
+ usage_data_ref["prompt_tokens"] += usage.prompt_tokens
1297
+ usage_data_ref["completion_tokens"] += getattr(
1298
  usage, "completion_tokens", 0
1299
+ )
1300
  lib_logger.info(
1301
  f"Recorded usage from response object for key {mask_credential(key)}"
1302
  )
 
1304
  provider_name = model.split("/")[0]
1305
  provider_plugin = self.provider_plugins.get(provider_name)
1306
 
 
1307
  if provider_plugin and getattr(
1308
  provider_plugin, "skip_cost_calculation", False
1309
  ):
 
1311
  f"Skipping cost calculation for provider '{provider_name}' (custom provider)."
1312
  )
1313
  else:
 
1314
  if isinstance(completion_response, litellm.EmbeddingResponse):
 
1315
  model_info = litellm.get_model_info(model)
1316
  input_cost = model_info.get("input_cost_per_token")
1317
  if input_cost:
 
1326
  )
1327
 
1328
  if cost is not None:
1329
+ usage_data_ref["approx_cost"] += cost
1330
  except Exception as e:
1331
  lib_logger.warning(
1332
  f"Could not calculate cost for model {model}: {e}"
 
1334
  elif isinstance(completion_response, asyncio.Future) or hasattr(
1335
  completion_response, "__aiter__"
1336
  ):
1337
+ pass # Stream - usage recorded from chunks
 
1338
  else:
1339
  lib_logger.warning(
1340
  f"No usage data found in completion response for model {model}. Recording success without token count."
 
1351
  classified_error: ClassifiedError,
1352
  increment_consecutive_failures: bool = True,
1353
  ):
1354
+ """Records a failure and applies cooldowns based on error type.
1355
+
1356
+ Distinguishes between:
1357
+ - quota_exceeded: Long cooldown with exact reset time (from quota_reset_timestamp)
1358
+ Sets quota_reset_ts on model (and group) - this becomes authoritative stats reset time
1359
+ - rate_limit: Short transient cooldown (just wait and retry)
1360
+ Only sets model_cooldowns - does NOT affect stats reset timing
1361
 
1362
  Args:
1363
  key: The API key or credential identifier
 
1368
  """
1369
  await self._lazy_init()
1370
  async with self._data_lock:
1371
+ now_ts = time.time()
1372
  today_utc_str = datetime.now(timezone.utc).date().isoformat()
1373
 
 
 
1374
  reset_config = self._get_usage_reset_config(key)
1375
+ reset_mode = (
1376
+ reset_config.get("mode", "credential") if reset_config else "credential"
1377
+ )
1378
 
1379
  # Initialize key data with appropriate structure
1380
+ if reset_mode == "per_model":
1381
  key_data = self._usage_data.setdefault(
1382
  key,
1383
  {
1384
+ "models": {},
1385
  "global": {"models": {}},
1386
  "model_cooldowns": {},
1387
  "failures": {},
 
1409
 
1410
  # Calculate cooldown duration based on error type
1411
  cooldown_seconds = None
1412
+ model_cooldowns = key_data.setdefault("model_cooldowns", {})
1413
 
1414
+ if classified_error.error_type == "quota_exceeded":
1415
+ # Quota exhausted - use authoritative reset timestamp if available
1416
+ quota_reset_ts = classified_error.quota_reset_timestamp
1417
  cooldown_seconds = classified_error.retry_after or 60
1418
+
1419
+ if quota_reset_ts and reset_mode == "per_model":
1420
+ # Set quota_reset_ts on model - this becomes authoritative stats reset time
1421
+ models_data = key_data.setdefault("models", {})
1422
+ model_data = models_data.setdefault(
1423
+ model,
1424
+ {
1425
+ "window_start_ts": None,
1426
+ "quota_reset_ts": None,
1427
+ "success_count": 0,
1428
+ "prompt_tokens": 0,
1429
+ "completion_tokens": 0,
1430
+ "approx_cost": 0.0,
1431
+ },
1432
+ )
1433
+ model_data["quota_reset_ts"] = quota_reset_ts
1434
+
1435
+ # Apply to all models in the same quota group
1436
+ group = self._get_model_quota_group(key, model)
1437
+ if group:
1438
+ grouped_models = self._get_grouped_models(key, group)
1439
+ for grouped_model in grouped_models:
1440
+ group_model_data = models_data.setdefault(
1441
+ grouped_model,
1442
+ {
1443
+ "window_start_ts": None,
1444
+ "quota_reset_ts": None,
1445
+ "success_count": 0,
1446
+ "prompt_tokens": 0,
1447
+ "completion_tokens": 0,
1448
+ "approx_cost": 0.0,
1449
+ },
1450
+ )
1451
+ group_model_data["quota_reset_ts"] = quota_reset_ts
1452
+ # Also set transient cooldown for selection logic
1453
+ model_cooldowns[grouped_model] = quota_reset_ts
1454
+
1455
+ reset_dt = datetime.fromtimestamp(
1456
+ quota_reset_ts, tz=timezone.utc
1457
+ )
1458
  lib_logger.info(
1459
+ f"Quota exhausted for group '{group}' ({len(grouped_models)} models) "
1460
+ f"on {mask_credential(key)}. Resets at {reset_dt.isoformat()}"
1461
  )
1462
  else:
1463
+ reset_dt = datetime.fromtimestamp(
1464
+ quota_reset_ts, tz=timezone.utc
1465
+ )
1466
+ hours = (quota_reset_ts - now_ts) / 3600
1467
  lib_logger.info(
1468
+ f"Quota exhausted for model {model} on {mask_credential(key)}. "
1469
+ f"Resets at {reset_dt.isoformat()} ({hours:.1f}h)"
1470
  )
1471
+
1472
+ # Set transient cooldown for selection logic
1473
+ model_cooldowns[model] = quota_reset_ts
1474
  else:
1475
+ # No authoritative timestamp or legacy mode - just use retry_after
1476
+ model_cooldowns[model] = now_ts + cooldown_seconds
1477
+ hours = cooldown_seconds / 3600
1478
  lib_logger.info(
1479
+ f"Quota exhausted on {mask_credential(key)} for model {model}. "
1480
+ f"Cooldown: {cooldown_seconds}s ({hours:.1f}h)"
1481
  )
1482
+
1483
+ elif classified_error.error_type == "rate_limit":
1484
+ # Transient rate limit - just set short cooldown (does NOT set quota_reset_ts)
1485
+ cooldown_seconds = classified_error.retry_after or 60
1486
+ model_cooldowns[model] = now_ts + cooldown_seconds
1487
+ lib_logger.info(
1488
+ f"Rate limit on {mask_credential(key)} for model {model}. "
1489
+ f"Transient cooldown: {cooldown_seconds}s"
1490
+ )
1491
+
1492
  elif classified_error.error_type == "authentication":
1493
  # Apply a 5-minute key-level lockout for auth errors
1494
+ key_data["key_cooldown_until"] = now_ts + 300
1495
+ cooldown_seconds = 300
1496
+ model_cooldowns[model] = now_ts + cooldown_seconds
1497
  lib_logger.warning(
1498
  f"Authentication error on key {mask_credential(key)}. Applying 5-minute key-level lockout."
1499
  )
 
 
1500
 
1501
  # If we should increment failures, calculate escalating backoff
1502
  if should_increment:
 
1510
  # If cooldown wasn't set by specific error type, use escalating backoff
1511
  if cooldown_seconds is None:
1512
  backoff_tiers = {1: 10, 2: 30, 3: 60, 4: 120}
1513
+ cooldown_seconds = backoff_tiers.get(count, 7200)
1514
+ model_cooldowns[model] = now_ts + cooldown_seconds
 
1515
  lib_logger.warning(
1516
  f"Failure #{count} for key {mask_credential(key)} with model {model}. "
1517
+ f"Error type: {classified_error.error_type}, cooldown: {cooldown_seconds}s"
1518
  )
1519
  else:
1520
  # Provider-level errors: apply short cooldown but don't count against key
1521
  if cooldown_seconds is None:
1522
+ cooldown_seconds = 30
1523
+ model_cooldowns[model] = now_ts + cooldown_seconds
1524
  lib_logger.info(
1525
+ f"Provider-level error ({classified_error.error_type}) for key {mask_credential(key)} "
1526
+ f"with model {model}. NOT incrementing failures. Cooldown: {cooldown_seconds}s"
1527
  )
1528
 
 
 
 
 
 
 
 
 
1529
  # Check for key-level lockout condition
1530
  await self._check_key_lockout(key, key_data)
1531
 
1532
  key_data["last_failure"] = {
1533
+ "timestamp": now_ts,
1534
  "model": model,
1535
  "error": str(classified_error.original_exception),
1536
  }