Mirrowel committed on
Commit
0ca1651
·
1 Parent(s): 98f6823

feat(usage): ✨ implement per-model quota tracking with authoritative reset timestamps

Browse files

This commit introduces granular per-model quota tracking that supports provider-specific reset timestamps from quota-exhausted errors.

Key changes:

- Add a `quota_reset_timestamp` field to `ClassifiedError` to capture the authoritative Unix timestamp from the provider's quota-exhausted responses
- Implement per-model usage tracking mode where each model maintains its own window with `window_start_ts` and `quota_reset_ts`
- Add quota group support for models that share quota limits (e.g., Claude Sonnet and Opus on Antigravity)
- Parse Antigravity's `quotaResetTimeStamp` ISO format to Unix timestamp for precise reset timing
- Update reset logic to prioritize authoritative `quota_reset_ts` over fallback window calculations
- Distinguish between quota-exhausted errors (which set an authoritative reset time) and rate-limit errors (which apply only a transient cooldown)
- Migrate Antigravity provider to per-model tracking with 5-hour windows for paid tier and 7-day windows for free tier

The per-model mode enables more accurate quota tracking by using exact reset times from provider error responses rather than estimated windows, preventing premature resets and improving credential utilization.

BREAKING CHANGE: Provider implementations using custom `get_usage_reset_config()` must now return a `mode` field ("per_model" or "credential") instead of `field_name`. The usage data structure has changed from `key_data["field_name"]["models"]` to `key_data["models"]` for per-model tracking. Existing usage data will be preserved but new tracking will use the updated structure.

src/rotator_library/error_handler.py CHANGED
@@ -347,14 +347,26 @@ class ClassifiedError:
347
  original_exception: Exception,
348
  status_code: Optional[int] = None,
349
  retry_after: Optional[int] = None,
 
350
  ):
351
  self.error_type = error_type
352
  self.original_exception = original_exception
353
  self.status_code = status_code
354
  self.retry_after = retry_after
 
 
 
355
 
356
  def __str__(self):
357
- return f"ClassifiedError(type={self.error_type}, status={self.status_code}, retry_after={self.retry_after}, original_exc={self.original_exception})"
 
 
 
 
 
 
 
 
358
 
359
 
360
  def _extract_retry_from_json_body(json_text: str) -> Optional[int]:
@@ -567,6 +579,7 @@ def classify_error(e: Exception, provider: Optional[str] = None) -> ClassifiedEr
567
  retry_after = quota_info["retry_after"]
568
  reason = quota_info.get("reason", "QUOTA_EXHAUSTED")
569
  reset_ts = quota_info.get("reset_timestamp")
 
570
 
571
  # Log the parsed result with human-readable duration
572
  hours = retry_after / 3600
@@ -581,6 +594,7 @@ def classify_error(e: Exception, provider: Optional[str] = None) -> ClassifiedEr
581
  original_exception=e,
582
  status_code=429,
583
  retry_after=retry_after,
 
584
  )
585
  except Exception as parse_error:
586
  lib_logger.debug(
 
347
  original_exception: Exception,
348
  status_code: Optional[int] = None,
349
  retry_after: Optional[int] = None,
350
+ quota_reset_timestamp: Optional[float] = None,
351
  ):
352
  self.error_type = error_type
353
  self.original_exception = original_exception
354
  self.status_code = status_code
355
  self.retry_after = retry_after
356
+ # Unix timestamp when quota resets (from quota_exhausted errors)
357
+ # This is the authoritative reset time parsed from provider's error response
358
+ self.quota_reset_timestamp = quota_reset_timestamp
359
 
360
  def __str__(self):
361
+ parts = [
362
+ f"type={self.error_type}",
363
+ f"status={self.status_code}",
364
+ f"retry_after={self.retry_after}",
365
+ ]
366
+ if self.quota_reset_timestamp:
367
+ parts.append(f"quota_reset_ts={self.quota_reset_timestamp}")
368
+ parts.append(f"original_exc={self.original_exception}")
369
+ return f"ClassifiedError({', '.join(parts)})"
370
 
371
 
372
  def _extract_retry_from_json_body(json_text: str) -> Optional[int]:
 
579
  retry_after = quota_info["retry_after"]
580
  reason = quota_info.get("reason", "QUOTA_EXHAUSTED")
581
  reset_ts = quota_info.get("reset_timestamp")
582
+ quota_reset_timestamp = quota_info.get("quota_reset_timestamp")
583
 
584
  # Log the parsed result with human-readable duration
585
  hours = retry_after / 3600
 
594
  original_exception=e,
595
  status_code=429,
596
  retry_after=retry_after,
597
+ quota_reset_timestamp=quota_reset_timestamp,
598
  )
599
  except Exception as parse_error:
600
  lib_logger.debug(
src/rotator_library/providers/antigravity_provider.py CHANGED
@@ -600,6 +600,7 @@ class AntigravityProvider(AntigravityAuthBase, ProviderInterface):
600
  "retry_after": None,
601
  "reason": None,
602
  "reset_timestamp": None,
 
603
  }
604
 
605
  for detail in details:
@@ -626,8 +627,22 @@ class AntigravityProvider(AntigravityAuthBase, ProviderInterface):
626
  if parsed:
627
  result["retry_after"] = parsed
628
 
629
- # Capture reset timestamp for logging
630
- result["reset_timestamp"] = metadata.get("quotaResetTimeStamp")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
631
 
632
  # Return None if we couldn't extract retry_after
633
  if not result["retry_after"]:
@@ -826,45 +841,48 @@ class AntigravityProvider(AntigravityAuthBase, ProviderInterface):
826
  """
827
  Get Antigravity-specific usage tracking configuration based on credential tier.
828
 
829
- Antigravity has different quota reset windows by tier:
830
- - Paid tiers (priority 1): 5-hour rolling window
831
- - Free tier (priority 2): 7-day rolling window
832
- - Unknown/legacy: 7-day rolling window (conservative default)
 
 
 
833
 
834
  Args:
835
  credential: The credential path
836
 
837
  Returns:
838
- Usage reset configuration dict
839
  """
840
  tier = self.project_tier_cache.get(credential)
841
  if not tier:
842
  tier = self._load_tier_from_file(credential)
843
 
844
- # Paid tiers: 5-hour window
845
  if tier and tier not in ["free-tier", "legacy-tier", "unknown"]:
846
  return {
847
  "window_seconds": 5 * 60 * 60, # 18000 seconds = 5 hours
848
- "field_name": "5h_window",
849
  "priority": 1,
850
- "description": "5-hour rolling window (paid tier)",
851
  }
852
 
853
- # Free tier: 7-day window
854
  if tier == "free-tier":
855
  return {
856
  "window_seconds": 7 * 24 * 60 * 60, # 604800 seconds = 7 days
857
- "field_name": "weekly",
858
  "priority": 2,
859
- "description": "7-day rolling window (free tier)",
860
  }
861
 
862
- # Unknown/legacy: use 7-day window as conservative default
863
  return {
864
  "window_seconds": 7 * 24 * 60 * 60, # 604800 seconds = 7 days
865
- "field_name": "weekly",
866
  "priority": 10,
867
- "description": "7-day rolling window (unknown tier - conservative default)",
868
  }
869
 
870
  def get_default_usage_field_name(self) -> str:
@@ -872,9 +890,51 @@ class AntigravityProvider(AntigravityAuthBase, ProviderInterface):
872
  Get the default usage tracking field name for Antigravity.
873
 
874
  Returns:
875
- "weekly" as the conservative default for unknown credentials
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
876
  """
877
- return "weekly"
878
 
879
  async def initialize_credentials(self, credential_paths: List[str]) -> None:
880
  """
 
600
  "retry_after": None,
601
  "reason": None,
602
  "reset_timestamp": None,
603
+ "quota_reset_timestamp": None, # Unix timestamp for quota reset
604
  }
605
 
606
  for detail in details:
 
627
  if parsed:
628
  result["retry_after"] = parsed
629
 
630
+ # Capture reset timestamp for logging and authoritative reset time
631
+ reset_ts_str = metadata.get("quotaResetTimeStamp")
632
+ result["reset_timestamp"] = reset_ts_str
633
+
634
+ # Parse ISO timestamp to Unix timestamp for usage tracking
635
+ if reset_ts_str:
636
+ try:
637
+ # Handle ISO format: "2025-12-11T22:53:16Z"
638
+ reset_dt = datetime.fromisoformat(
639
+ reset_ts_str.replace("Z", "+00:00")
640
+ )
641
+ result["quota_reset_timestamp"] = reset_dt.timestamp()
642
+ except (ValueError, AttributeError) as e:
643
+ lib_logger.warning(
644
+ f"Failed to parse quota reset timestamp '{reset_ts_str}': {e}"
645
+ )
646
 
647
  # Return None if we couldn't extract retry_after
648
  if not result["retry_after"]:
 
841
  """
842
  Get Antigravity-specific usage tracking configuration based on credential tier.
843
 
844
+ Antigravity uses per-model windows with different durations by tier:
845
+ - Paid tiers (priority 1): 5-hour per-model window
846
+ - Free tier (priority 2): 7-day per-model window
847
+ - Unknown/legacy: 7-day per-model window (conservative default)
848
+
849
+ When a model hits a quota_exhausted 429 error with exact reset timestamp,
850
+ that timestamp becomes the authoritative reset time for the model (and its group).
851
 
852
  Args:
853
  credential: The credential path
854
 
855
  Returns:
856
+ Usage reset configuration dict with mode="per_model"
857
  """
858
  tier = self.project_tier_cache.get(credential)
859
  if not tier:
860
  tier = self._load_tier_from_file(credential)
861
 
862
+ # Paid tiers: 5-hour per-model window
863
  if tier and tier not in ["free-tier", "legacy-tier", "unknown"]:
864
  return {
865
  "window_seconds": 5 * 60 * 60, # 18000 seconds = 5 hours
866
+ "mode": "per_model",
867
  "priority": 1,
868
+ "description": "5-hour per-model window (paid tier)",
869
  }
870
 
871
+ # Free tier: 7-day per-model window
872
  if tier == "free-tier":
873
  return {
874
  "window_seconds": 7 * 24 * 60 * 60, # 604800 seconds = 7 days
875
+ "mode": "per_model",
876
  "priority": 2,
877
+ "description": "7-day per-model window (free tier)",
878
  }
879
 
880
+ # Unknown/legacy: use 7-day per-model window as conservative default
881
  return {
882
  "window_seconds": 7 * 24 * 60 * 60, # 604800 seconds = 7 days
883
+ "mode": "per_model",
884
  "priority": 10,
885
+ "description": "7-day per-model window (unknown tier - conservative default)",
886
  }
887
 
888
  def get_default_usage_field_name(self) -> str:
 
890
  Get the default usage tracking field name for Antigravity.
891
 
892
  Returns:
893
+ "models" for per-model tracking
894
+ """
895
+ return "models"
896
+
897
+ # =========================================================================
898
+ # Model Quota Grouping
899
+ # =========================================================================
900
+
901
+ # Models that share quota timing - when one hits quota, all get same reset time
902
+ QUOTA_GROUPS = {
903
+ # Future: add claude/gemini groups if they share quota
904
+ }
905
+
906
+ def get_model_quota_group(self, model: str) -> Optional[str]:
907
+ """
908
+ Returns the quota group name for a model.
909
+
910
+ Claude models (sonnet and opus) share quota on Antigravity.
911
+ When one hits quota exhausted, all models in the group get the same reset time.
912
+
913
+ Args:
914
+ model: Model name (with or without "antigravity/" prefix)
915
+
916
+ Returns:
917
+ Group name ("claude") or None if not grouped
918
+ """
919
+ # Remove provider prefix if present
920
+ clean_model = model.replace("antigravity/", "")
921
+
922
+ for group_name, models in self.QUOTA_GROUPS.items():
923
+ if clean_model in models:
924
+ return group_name
925
+ return None
926
+
927
+ def get_models_in_quota_group(self, group: str) -> List[str]:
928
+ """
929
+ Returns all model names in a quota group.
930
+
931
+ Args:
932
+ group: Group name (e.g., "claude")
933
+
934
+ Returns:
935
+ List of model names (without provider prefix)
936
  """
937
+ return self.QUOTA_GROUPS.get(group, [])
938
 
939
  async def initialize_credentials(self, credential_paths: List[str]) -> None:
940
  """
src/rotator_library/providers/provider_interface.py CHANGED
@@ -202,6 +202,7 @@ class ProviderInterface(ABC):
202
  "retry_after": int, # seconds until quota resets
203
  "reason": str, # e.g., "QUOTA_EXHAUSTED", "RATE_LIMITED"
204
  "reset_timestamp": str | None, # ISO timestamp if available
 
205
  }
206
  """
207
  return None # Default: no provider-specific parsing
@@ -218,9 +219,9 @@ class ProviderInterface(ABC):
218
  credential tier (e.g., paid vs free accounts with different quota periods).
219
 
220
  The UsageManager will use this configuration to:
221
- 1. Track usage in a custom-named field (instead of default "daily")
222
- 2. Reset usage based on a rolling window from first request
223
- 3. Archive stats to "global" when the window expires
224
 
225
  Args:
226
  credential: The credential identifier (API key or path)
@@ -229,32 +230,35 @@ class ProviderInterface(ABC):
229
  None to use default daily reset, otherwise a dict with:
230
  {
231
  "window_seconds": int, # Duration in seconds (e.g., 18000 for 5h)
232
- "field_name": str, # Custom field name (e.g., "5h_window", "weekly")
233
- "priority": int, # Priority level this config applies to (for docs)
234
  "description": str, # Human-readable description (for logging)
235
  }
236
 
 
 
 
 
 
 
 
 
237
  Examples:
238
- Antigravity paid tier:
239
  {
240
  "window_seconds": 18000, # 5 hours
241
- "field_name": "5h_window",
242
  "priority": 1,
243
- "description": "5-hour rolling window (paid tier)"
244
  }
245
 
246
- Antigravity free tier:
247
  {
248
- "window_seconds": 604800, # 7 days
249
- "field_name": "weekly",
250
- "priority": 2,
251
- "description": "7-day rolling window (free tier)"
252
  }
253
-
254
- Note:
255
- - window_seconds: Time from first request until stats reset
256
- - When window expires, stats move to "global" (same as daily reset)
257
- - First request after window expiry starts a new window
258
  """
259
  return None # Default: use daily reset at daily_reset_time_utc
260
 
@@ -269,3 +273,39 @@ class ProviderInterface(ABC):
269
  Field name string (default: "daily")
270
  """
271
  return "daily"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
202
  "retry_after": int, # seconds until quota resets
203
  "reason": str, # e.g., "QUOTA_EXHAUSTED", "RATE_LIMITED"
204
  "reset_timestamp": str | None, # ISO timestamp if available
205
+ "quota_reset_timestamp": float | None, # Unix timestamp for quota reset
206
  }
207
  """
208
  return None # Default: no provider-specific parsing
 
219
  credential tier (e.g., paid vs free accounts with different quota periods).
220
 
221
  The UsageManager will use this configuration to:
222
+ 1. Track usage per-model or per-credential based on mode
223
+ 2. Reset usage based on a rolling window OR quota exhausted timestamp
224
+ 3. Archive stats to "global" when the window/quota expires
225
 
226
  Args:
227
  credential: The credential identifier (API key or path)
 
230
  None to use default daily reset, otherwise a dict with:
231
  {
232
  "window_seconds": int, # Duration in seconds (e.g., 18000 for 5h)
233
+ "mode": str, # "credential" or "per_model"
234
+ "priority": int, # Priority level this config applies to
235
  "description": str, # Human-readable description (for logging)
236
  }
237
 
238
+ Modes:
239
+ - "credential": One window per credential. Window starts from first
240
+ request of ANY model. All models reset together when window expires.
241
+ - "per_model": Separate window per model (or model group). Window starts
242
+ from first request of THAT model. Models reset independently unless
243
+ grouped. If a quota_exhausted error provides exact reset time, that
244
+ becomes the authoritative reset time for the model.
245
+
246
  Examples:
247
+ Antigravity paid tier (per-model):
248
  {
249
  "window_seconds": 18000, # 5 hours
250
+ "mode": "per_model",
251
  "priority": 1,
252
+ "description": "5-hour per-model window (paid tier)"
253
  }
254
 
255
+ Default provider (credential-level):
256
  {
257
+ "window_seconds": 86400, # 24 hours
258
+ "mode": "credential",
259
+ "priority": 1,
260
+ "description": "24-hour credential window"
261
  }
 
 
 
 
 
262
  """
263
  return None # Default: use daily reset at daily_reset_time_utc
264
 
 
273
  Field name string (default: "daily")
274
  """
275
  return "daily"
276
+
277
+ # =========================================================================
278
+ # Model Quota Grouping
279
+ # =========================================================================
280
+
281
+ def get_model_quota_group(self, model: str) -> Optional[str]:
282
+ """
283
+ Returns the quota group name for a model, or None if not grouped.
284
+
285
+ Models in the same quota group share cooldown timing - when one model
286
+ hits a quota exhausted error, all models in the group get the same
287
+ reset timestamp. They also reset (archive stats) together.
288
+
289
+ This is useful for providers where multiple model variants share the
290
+ same underlying quota (e.g., Claude Sonnet and Opus on Antigravity).
291
+
292
+ Args:
293
+ model: Model name (with or without provider prefix)
294
+
295
+ Returns:
296
+ Group name string (e.g., "claude") or None if model is not grouped
297
+ """
298
+ return None
299
+
300
+ def get_models_in_quota_group(self, group: str) -> List[str]:
301
+ """
302
+ Returns all model names that belong to a quota group.
303
+
304
+ Args:
305
+ group: Group name (e.g., "claude")
306
+
307
+ Returns:
308
+ List of model names (WITHOUT provider prefix) in the group.
309
+ Empty list if group doesn't exist.
310
+ """
311
+ return []
src/rotator_library/usage_manager.py CHANGED
@@ -162,6 +162,69 @@ class UsageManager:
162
 
163
  return None
164
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
165
  def _get_usage_field_name(self, credential: str) -> str:
166
  """
167
  Get the usage tracking field name for a credential.
@@ -190,27 +253,36 @@ class UsageManager:
190
 
191
  def _get_usage_count(self, key: str, model: str) -> int:
192
  """
193
- Get the current usage count for a model from the appropriate usage field.
 
 
 
 
194
 
195
  Args:
196
  key: Credential identifier
197
  model: Model name
198
 
199
  Returns:
200
- Usage count (success_count) for the model in the current window/daily period
201
  """
202
  if self._usage_data is None:
203
  return 0
204
 
205
  key_data = self._usage_data.get(key, {})
206
- usage_field = self._get_usage_field_name(key)
207
 
208
- return (
209
- key_data.get(usage_field, {})
210
- .get("models", {})
211
- .get(model, {})
212
- .get("success_count", 0)
213
- )
 
 
 
 
 
214
 
215
  def _select_sequential(
216
  self,
@@ -299,9 +371,10 @@ class UsageManager:
299
  """
300
  Checks if usage stats need to be reset for any key.
301
 
302
- Supports two reset modes:
303
- 1. Provider-specific rolling windows (e.g., 5h for Antigravity paid, 7d for free)
304
- 2. Legacy daily reset at daily_reset_time_utc for providers without custom config
 
305
  """
306
  if self._usage_data is None:
307
  return
@@ -312,16 +385,23 @@ class UsageManager:
312
  needs_saving = False
313
 
314
  for key, data in self._usage_data.items():
315
- # Check for provider-specific reset configuration
316
  reset_config = self._get_usage_reset_config(key)
317
 
318
  if reset_config:
319
- # Provider-specific rolling window reset
320
- needs_saving |= await self._check_window_reset(
321
- key, data, reset_config, now_ts
322
- )
 
 
 
 
 
 
 
 
323
  elif self.daily_reset_time_utc:
324
- # Legacy daily reset for providers without custom config
325
  needs_saving |= await self._check_daily_reset(
326
  key, data, now_utc, today_str, now_ts
327
  )
@@ -329,6 +409,170 @@ class UsageManager:
329
  if needs_saving:
330
  await self._save_usage()
331
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
332
  async def _check_window_reset(
333
  self,
334
  key: str,
@@ -948,36 +1192,67 @@ class UsageManager:
948
  Records a successful API call, resetting failure counters.
949
  It safely handles cases where token usage data is not available.
950
 
951
- Uses provider-specific field names for usage tracking (e.g., "5h_window", "weekly")
952
- and sets window start timestamp on first request.
 
953
  """
954
  await self._lazy_init()
955
  async with self._data_lock:
956
  now_ts = time.time()
957
  today_utc_str = datetime.now(timezone.utc).date().isoformat()
958
 
959
- # Determine the usage field name for this credential
960
- usage_field = self._get_usage_field_name(key)
961
  reset_config = self._get_usage_reset_config(key)
962
- uses_window = reset_config is not None
 
 
963
 
964
- # Initialize key data with appropriate structure
965
- if uses_window:
966
- # Provider-specific rolling window
967
  key_data = self._usage_data.setdefault(
968
  key,
969
  {
970
- usage_field: {"start_ts": None, "models": {}},
971
  "global": {"models": {}},
972
  "model_cooldowns": {},
973
  "failures": {},
974
  },
975
  )
976
- # Ensure the usage field exists (for migration from old format)
977
- if usage_field not in key_data:
978
- key_data[usage_field] = {"start_ts": None, "models": {}}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
979
  else:
980
- # Legacy daily reset
981
  key_data = self._usage_data.setdefault(
982
  key,
983
  {
@@ -987,57 +1262,41 @@ class UsageManager:
987
  "failures": {},
988
  },
989
  )
990
- usage_field = "daily"
991
 
992
- # If the key is new, ensure its reset date is initialized to prevent an immediate reset.
993
- if not uses_window and "last_daily_reset" not in key_data:
994
- key_data["last_daily_reset"] = today_utc_str
995
 
996
- # Always record a success and reset failures
 
 
 
 
 
 
 
 
 
 
 
 
997
  model_failures = key_data.setdefault("failures", {}).setdefault(model, {})
998
  model_failures["consecutive_failures"] = 0
 
 
999
  if model in key_data.get("model_cooldowns", {}):
1000
  del key_data["model_cooldowns"][model]
1001
 
1002
- # Get or create the usage field data
1003
- usage_data = key_data.setdefault(usage_field, {"models": {}})
1004
-
1005
- # For window-based tracking, set start_ts on first request
1006
- if uses_window:
1007
- if usage_data.get("start_ts") is None:
1008
- usage_data["start_ts"] = now_ts
1009
- window_hours = reset_config.get("window_seconds", 0) / 3600
1010
- description = reset_config.get("description", "rolling window")
1011
- lib_logger.info(
1012
- f"Starting new {window_hours:.1f}h window for {mask_credential(key)} - {description}"
1013
- )
1014
-
1015
- # Ensure models dict exists
1016
- if "models" not in usage_data:
1017
- usage_data["models"] = {}
1018
-
1019
- model_data = usage_data["models"].setdefault(
1020
- model,
1021
- {
1022
- "success_count": 0,
1023
- "prompt_tokens": 0,
1024
- "completion_tokens": 0,
1025
- "approx_cost": 0.0,
1026
- },
1027
- )
1028
- model_data["success_count"] += 1
1029
-
1030
- # Safely attempt to record token and cost usage
1031
  if (
1032
  completion_response
1033
  and hasattr(completion_response, "usage")
1034
  and completion_response.usage
1035
  ):
1036
  usage = completion_response.usage
1037
- model_data["prompt_tokens"] += usage.prompt_tokens
1038
- model_data["completion_tokens"] += getattr(
1039
  usage, "completion_tokens", 0
1040
- ) # Not present in embedding responses
1041
  lib_logger.info(
1042
  f"Recorded usage from response object for key {mask_credential(key)}"
1043
  )
@@ -1045,7 +1304,6 @@ class UsageManager:
1045
  provider_name = model.split("/")[0]
1046
  provider_plugin = self.provider_plugins.get(provider_name)
1047
 
1048
- # Check class attribute directly - no need to instantiate
1049
  if provider_plugin and getattr(
1050
  provider_plugin, "skip_cost_calculation", False
1051
  ):
@@ -1053,9 +1311,7 @@ class UsageManager:
1053
  f"Skipping cost calculation for provider '{provider_name}' (custom provider)."
1054
  )
1055
  else:
1056
- # Differentiate cost calculation based on response type
1057
  if isinstance(completion_response, litellm.EmbeddingResponse):
1058
- # Manually calculate cost for embeddings
1059
  model_info = litellm.get_model_info(model)
1060
  input_cost = model_info.get("input_cost_per_token")
1061
  if input_cost:
@@ -1070,7 +1326,7 @@ class UsageManager:
1070
  )
1071
 
1072
  if cost is not None:
1073
- model_data["approx_cost"] += cost
1074
  except Exception as e:
1075
  lib_logger.warning(
1076
  f"Could not calculate cost for model {model}: {e}"
@@ -1078,8 +1334,7 @@ class UsageManager:
1078
  elif isinstance(completion_response, asyncio.Future) or hasattr(
1079
  completion_response, "__aiter__"
1080
  ):
1081
- # This is an unconsumed stream object. Do not log a warning, as usage will be recorded from the chunks.
1082
- pass
1083
  else:
1084
  lib_logger.warning(
1085
  f"No usage data found in completion response for model {model}. Recording success without token count."
@@ -1096,7 +1351,13 @@ class UsageManager:
1096
  classified_error: ClassifiedError,
1097
  increment_consecutive_failures: bool = True,
1098
  ):
1099
- """Records a failure and applies cooldowns based on an escalating backoff strategy.
 
 
 
 
 
 
1100
 
1101
  Args:
1102
  key: The API key or credential identifier
@@ -1107,19 +1368,20 @@ class UsageManager:
1107
  """
1108
  await self._lazy_init()
1109
  async with self._data_lock:
 
1110
  today_utc_str = datetime.now(timezone.utc).date().isoformat()
1111
 
1112
- # Determine the usage field name for this credential
1113
- usage_field = self._get_usage_field_name(key)
1114
  reset_config = self._get_usage_reset_config(key)
1115
- uses_window = reset_config is not None
 
 
1116
 
1117
  # Initialize key data with appropriate structure
1118
- if uses_window:
1119
  key_data = self._usage_data.setdefault(
1120
  key,
1121
  {
1122
- usage_field: {"start_ts": None, "models": {}},
1123
  "global": {"models": {}},
1124
  "model_cooldowns": {},
1125
  "failures": {},
@@ -1147,36 +1409,94 @@ class UsageManager:
1147
 
1148
  # Calculate cooldown duration based on error type
1149
  cooldown_seconds = None
 
1150
 
1151
- if classified_error.error_type in ["rate_limit", "quota_exceeded"]:
1152
- # Rate limit / Quota errors: use retry_after if available, otherwise default to 60s
 
1153
  cooldown_seconds = classified_error.retry_after or 60
1154
- if classified_error.retry_after:
1155
- # Log with human-readable duration for provider-parsed cooldowns
1156
- hours = cooldown_seconds / 3600
1157
- if hours >= 1:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1158
  lib_logger.info(
1159
- f"Quota/rate limit on key {mask_credential(key)} for model {model}. "
1160
- f"Applying provider-specified cooldown: {cooldown_seconds}s ({hours:.1f}h)"
1161
  )
1162
  else:
 
 
 
 
1163
  lib_logger.info(
1164
- f"Rate limit on key {mask_credential(key)} for model {model}. "
1165
- f"Applying provider-specified cooldown: {cooldown_seconds}s"
1166
  )
 
 
 
1167
  else:
 
 
 
1168
  lib_logger.info(
1169
- f"Rate limit on key {mask_credential(key)} for model {model}. "
1170
- f"Using default cooldown: {cooldown_seconds}s"
1171
  )
 
 
 
 
 
 
 
 
 
 
1172
  elif classified_error.error_type == "authentication":
1173
  # Apply a 5-minute key-level lockout for auth errors
1174
- key_data["key_cooldown_until"] = time.time() + 300
 
 
1175
  lib_logger.warning(
1176
  f"Authentication error on key {mask_credential(key)}. Applying 5-minute key-level lockout."
1177
  )
1178
- # Auth errors still use escalating backoff for the specific model
1179
- cooldown_seconds = 300 # 5 minutes for model cooldown
1180
 
1181
  # If we should increment failures, calculate escalating backoff
1182
  if should_increment:
@@ -1190,35 +1510,27 @@ class UsageManager:
1190
  # If cooldown wasn't set by specific error type, use escalating backoff
1191
  if cooldown_seconds is None:
1192
  backoff_tiers = {1: 10, 2: 30, 3: 60, 4: 120}
1193
- cooldown_seconds = backoff_tiers.get(
1194
- count, 7200
1195
- ) # Default to 2 hours for "spent" keys
1196
  lib_logger.warning(
1197
  f"Failure #{count} for key {mask_credential(key)} with model {model}. "
1198
- f"Error type: {classified_error.error_type}"
1199
  )
1200
  else:
1201
  # Provider-level errors: apply short cooldown but don't count against key
1202
  if cooldown_seconds is None:
1203
- cooldown_seconds = 30 # 30s cooldown for provider issues
 
1204
  lib_logger.info(
1205
- f"Provider-level error ({classified_error.error_type}) for key {mask_credential(key)} with model {model}. "
1206
- f"NOT incrementing consecutive failures. Applying {cooldown_seconds}s cooldown."
1207
  )
1208
 
1209
- # Apply the cooldown
1210
- model_cooldowns = key_data.setdefault("model_cooldowns", {})
1211
- model_cooldowns[model] = time.time() + cooldown_seconds
1212
- lib_logger.warning(
1213
- f"Cooldown applied for key {mask_credential(key)} with model {model}: {cooldown_seconds}s. "
1214
- f"Error type: {classified_error.error_type}"
1215
- )
1216
-
1217
  # Check for key-level lockout condition
1218
  await self._check_key_lockout(key, key_data)
1219
 
1220
  key_data["last_failure"] = {
1221
- "timestamp": time.time(),
1222
  "model": model,
1223
  "error": str(classified_error.original_exception),
1224
  }
 
162
 
163
  return None
164
 
165
+ def _get_reset_mode(self, credential: str) -> str:
166
+ """
167
+ Get the reset mode for a credential: 'credential' or 'per_model'.
168
+
169
+ Args:
170
+ credential: The credential identifier
171
+
172
+ Returns:
173
+ "per_model" or "credential" (default)
174
+ """
175
+ config = self._get_usage_reset_config(credential)
176
+ return config.get("mode", "credential") if config else "credential"
177
+
178
+ def _get_model_quota_group(self, credential: str, model: str) -> Optional[str]:
179
+ """
180
+ Get the quota group for a model, if the provider defines one.
181
+
182
+ Args:
183
+ credential: The credential identifier
184
+ model: Model name (with or without provider prefix)
185
+
186
+ Returns:
187
+ Group name (e.g., "claude") or None if not grouped
188
+ """
189
+ provider = self._get_provider_from_credential(credential)
190
+ if not provider:
191
+ return None
192
+
193
+ plugin = self.provider_plugins.get(provider)
194
+ if not plugin:
195
+ return None
196
+
197
+ if hasattr(plugin, "get_model_quota_group"):
198
+ return plugin.get_model_quota_group(model)
199
+
200
+ return None
201
+
202
+ def _get_grouped_models(self, credential: str, group: str) -> List[str]:
203
+ """
204
+ Get all model names in a quota group (with provider prefix).
205
+
206
+ Args:
207
+ credential: The credential identifier
208
+ group: Group name (e.g., "claude")
209
+
210
+ Returns:
211
+ List of full model names (e.g., ["antigravity/claude-opus-4-5", ...])
212
+ """
213
+ provider = self._get_provider_from_credential(credential)
214
+ if not provider:
215
+ return []
216
+
217
+ plugin = self.provider_plugins.get(provider)
218
+ if not plugin:
219
+ return []
220
+
221
+ if hasattr(plugin, "get_models_in_quota_group"):
222
+ models = plugin.get_models_in_quota_group(group)
223
+ # Add provider prefix
224
+ return [f"{provider}/{m}" for m in models]
225
+
226
+ return []
227
+
228
  def _get_usage_field_name(self, credential: str) -> str:
229
  """
230
  Get the usage tracking field name for a credential.
 
253
 
254
  def _get_usage_count(self, key: str, model: str) -> int:
255
  """
256
+ Get the current usage count for a model from the appropriate usage structure.
257
+
258
+ Supports both:
259
+ - New per-model structure: {"models": {"model_name": {"success_count": N, ...}}}
260
+ - Legacy structure: {"daily": {"models": {"model_name": {"success_count": N, ...}}}}
261
 
262
  Args:
263
  key: Credential identifier
264
  model: Model name
265
 
266
  Returns:
267
+ Usage count (success_count) for the model in the current window/period
268
  """
269
  if self._usage_data is None:
270
  return 0
271
 
272
  key_data = self._usage_data.get(key, {})
273
+ reset_mode = self._get_reset_mode(key)
274
 
275
+ if reset_mode == "per_model":
276
+ # New per-model structure: key_data["models"][model]["success_count"]
277
+ return key_data.get("models", {}).get(model, {}).get("success_count", 0)
278
+ else:
279
+ # Legacy structure: key_data["daily"]["models"][model]["success_count"]
280
+ return (
281
+ key_data.get("daily", {})
282
+ .get("models", {})
283
+ .get(model, {})
284
+ .get("success_count", 0)
285
+ )
286
 
287
  def _select_sequential(
288
  self,
 
371
  """
372
  Checks if usage stats need to be reset for any key.
373
 
374
+ Supports three reset modes:
375
+ 1. per_model: Each model has its own window, resets based on quota_reset_ts or fallback window
376
+ 2. credential: One window per credential (legacy with custom window duration)
377
+ 3. daily: Legacy daily reset at daily_reset_time_utc
378
  """
379
  if self._usage_data is None:
380
  return
 
385
  needs_saving = False
386
 
387
  for key, data in self._usage_data.items():
 
388
  reset_config = self._get_usage_reset_config(key)
389
 
390
  if reset_config:
391
+ reset_mode = reset_config.get("mode", "credential")
392
+
393
+ if reset_mode == "per_model":
394
+ # Per-model window reset
395
+ needs_saving |= await self._check_per_model_resets(
396
+ key, data, reset_config, now_ts
397
+ )
398
+ else:
399
+ # Credential-level window reset (legacy)
400
+ needs_saving |= await self._check_window_reset(
401
+ key, data, reset_config, now_ts
402
+ )
403
  elif self.daily_reset_time_utc:
404
+ # Legacy daily reset
405
  needs_saving |= await self._check_daily_reset(
406
  key, data, now_utc, today_str, now_ts
407
  )
 
409
  if needs_saving:
410
  await self._save_usage()
411
 
412
    async def _check_per_model_resets(
        self,
        key: str,
        data: Dict[str, Any],
        reset_config: Dict[str, Any],
        now_ts: float,
    ) -> bool:
        """
        Check and perform per-model resets for a credential.

        Each model resets independently based on:
        1. quota_reset_ts (authoritative, from quota exhausted error) if set
        2. window_start_ts + window_seconds (fallback) otherwise

        Grouped models reset together - all models in a group must be ready.

        Args:
            key: Credential identifier
            data: Usage data for this credential (mutated in place)
            reset_config: Provider's reset configuration
            now_ts: Current timestamp

        Returns:
            True if data was modified and needs saving
        """
        # Fallback window length used when a model has no authoritative
        # quota_reset_ts; defaults to 24h when the provider omits it.
        window_seconds = reset_config.get("window_seconds", 86400)
        models_data = data.get("models", {})

        if not models_data:
            return False

        modified = False
        processed_groups = set()

        # Iterate a snapshot: resets mutate the per-model dicts in place.
        for model, model_data in list(models_data.items()):
            # Check if this model is in a quota group
            group = self._get_model_quota_group(key, model)

            if group:
                if group in processed_groups:
                    continue  # Already handled this group

                # Check if entire group should reset (all members ready)
                if self._should_group_reset(
                    key, group, models_data, window_seconds, now_ts
                ):
                    # Archive and reset all models in group, not just the
                    # one currently being iterated.
                    grouped_models = self._get_grouped_models(key, group)
                    archived_count = 0

                    for grouped_model in grouped_models:
                        if grouped_model in models_data:
                            gm_data = models_data[grouped_model]
                            self._archive_model_to_global(data, grouped_model, gm_data)
                            self._reset_model_data(gm_data)
                            archived_count += 1

                    if archived_count > 0:
                        lib_logger.info(
                            f"Reset model group '{group}' ({archived_count} models) for {mask_credential(key)}"
                        )
                        modified = True

                # Mark the group processed even when nothing reset, so later
                # members of the same group are not re-evaluated this pass.
                processed_groups.add(group)

            else:
                # Ungrouped model - check individually
                if self._should_model_reset(model_data, window_seconds, now_ts):
                    self._archive_model_to_global(data, model, model_data)
                    self._reset_model_data(model_data)
                    lib_logger.info(f"Reset model {model} for {mask_credential(key)}")
                    modified = True

        # Preserve unexpired cooldowns
        if modified:
            self._preserve_unexpired_cooldowns(key, data, now_ts)
            # NOTE(review): this wipes failure counters for ALL models on the
            # credential, not only the ones that just reset - confirm intended.
            if "failures" in data:
                data["failures"] = {}

        return modified
+
493
+ def _should_model_reset(
494
+ self, model_data: Dict[str, Any], window_seconds: int, now_ts: float
495
+ ) -> bool:
496
+ """
497
+ Check if a single model should reset.
498
+
499
+ Returns True if:
500
+ - quota_reset_ts is set AND now >= quota_reset_ts, OR
501
+ - quota_reset_ts is NOT set AND now >= window_start_ts + window_seconds
502
+ """
503
+ quota_reset = model_data.get("quota_reset_ts")
504
+ window_start = model_data.get("window_start_ts")
505
+
506
+ if quota_reset:
507
+ return now_ts >= quota_reset
508
+ elif window_start:
509
+ return now_ts >= window_start + window_seconds
510
+ return False
511
+
512
+ def _should_group_reset(
513
+ self,
514
+ key: str,
515
+ group: str,
516
+ models_data: Dict[str, Dict],
517
+ window_seconds: int,
518
+ now_ts: float,
519
+ ) -> bool:
520
+ """
521
+ Check if all models in a group should reset.
522
+
523
+ All models in the group must be ready to reset.
524
+ If any model has an active cooldown/window, the whole group waits.
525
+ """
526
+ grouped_models = self._get_grouped_models(key, group)
527
+
528
+ # Track if any model in group has data
529
+ any_has_data = False
530
+
531
+ for grouped_model in grouped_models:
532
+ model_data = models_data.get(grouped_model, {})
533
+
534
+ if not model_data or (
535
+ model_data.get("window_start_ts") is None
536
+ and model_data.get("success_count", 0) == 0
537
+ ):
538
+ continue # No stats for this model yet
539
+
540
+ any_has_data = True
541
+
542
+ if not self._should_model_reset(model_data, window_seconds, now_ts):
543
+ return False # At least one model not ready
544
+
545
+ return any_has_data
546
+
547
+ def _archive_model_to_global(
548
+ self, data: Dict[str, Any], model: str, model_data: Dict[str, Any]
549
+ ) -> None:
550
+ """Archive a single model's stats to global."""
551
+ global_data = data.setdefault("global", {"models": {}})
552
+ global_model = global_data["models"].setdefault(
553
+ model,
554
+ {
555
+ "success_count": 0,
556
+ "prompt_tokens": 0,
557
+ "completion_tokens": 0,
558
+ "approx_cost": 0.0,
559
+ },
560
+ )
561
+
562
+ global_model["success_count"] += model_data.get("success_count", 0)
563
+ global_model["prompt_tokens"] += model_data.get("prompt_tokens", 0)
564
+ global_model["completion_tokens"] += model_data.get("completion_tokens", 0)
565
+ global_model["approx_cost"] += model_data.get("approx_cost", 0.0)
566
+
567
+ def _reset_model_data(self, model_data: Dict[str, Any]) -> None:
568
+ """Reset a model's window and stats."""
569
+ model_data["window_start_ts"] = None
570
+ model_data["quota_reset_ts"] = None
571
+ model_data["success_count"] = 0
572
+ model_data["prompt_tokens"] = 0
573
+ model_data["completion_tokens"] = 0
574
+ model_data["approx_cost"] = 0.0
575
+
576
  async def _check_window_reset(
577
  self,
578
  key: str,
 
1192
  Records a successful API call, resetting failure counters.
1193
  It safely handles cases where token usage data is not available.
1194
 
1195
+ Supports two modes based on provider configuration:
1196
+ - per_model: Each model has its own window_start_ts and stats in key_data["models"]
1197
+ - credential: Legacy mode with key_data["daily"]["models"]
1198
  """
1199
  await self._lazy_init()
1200
  async with self._data_lock:
1201
  now_ts = time.time()
1202
  today_utc_str = datetime.now(timezone.utc).date().isoformat()
1203
 
 
 
1204
  reset_config = self._get_usage_reset_config(key)
1205
+ reset_mode = (
1206
+ reset_config.get("mode", "credential") if reset_config else "credential"
1207
+ )
1208
 
1209
+ if reset_mode == "per_model":
1210
+ # New per-model structure
 
1211
  key_data = self._usage_data.setdefault(
1212
  key,
1213
  {
1214
+ "models": {},
1215
  "global": {"models": {}},
1216
  "model_cooldowns": {},
1217
  "failures": {},
1218
  },
1219
  )
1220
+
1221
+ # Ensure models dict exists
1222
+ if "models" not in key_data:
1223
+ key_data["models"] = {}
1224
+
1225
+ # Get or create per-model data with window tracking
1226
+ model_data = key_data["models"].setdefault(
1227
+ model,
1228
+ {
1229
+ "window_start_ts": None,
1230
+ "quota_reset_ts": None,
1231
+ "success_count": 0,
1232
+ "prompt_tokens": 0,
1233
+ "completion_tokens": 0,
1234
+ "approx_cost": 0.0,
1235
+ },
1236
+ )
1237
+
1238
+ # Start window on first request for this model
1239
+ if model_data.get("window_start_ts") is None:
1240
+ model_data["window_start_ts"] = now_ts
1241
+ window_hours = (
1242
+ reset_config.get("window_seconds", 0) / 3600
1243
+ if reset_config
1244
+ else 0
1245
+ )
1246
+ lib_logger.info(
1247
+ f"Started {window_hours:.1f}h window for model {model} on {mask_credential(key)}"
1248
+ )
1249
+
1250
+ # Record stats
1251
+ model_data["success_count"] += 1
1252
+ usage_data_ref = model_data # For token/cost recording below
1253
+
1254
  else:
1255
+ # Legacy credential-level structure
1256
  key_data = self._usage_data.setdefault(
1257
  key,
1258
  {
 
1262
  "failures": {},
1263
  },
1264
  )
 
1265
 
1266
+ if "last_daily_reset" not in key_data:
1267
+ key_data["last_daily_reset"] = today_utc_str
 
1268
 
1269
+ # Get or create model data in daily structure
1270
+ usage_data_ref = key_data["daily"]["models"].setdefault(
1271
+ model,
1272
+ {
1273
+ "success_count": 0,
1274
+ "prompt_tokens": 0,
1275
+ "completion_tokens": 0,
1276
+ "approx_cost": 0.0,
1277
+ },
1278
+ )
1279
+ usage_data_ref["success_count"] += 1
1280
+
1281
+ # Reset failures for this model
1282
  model_failures = key_data.setdefault("failures", {}).setdefault(model, {})
1283
  model_failures["consecutive_failures"] = 0
1284
+
1285
+ # Clear transient cooldown on success (but NOT quota_reset_ts)
1286
  if model in key_data.get("model_cooldowns", {}):
1287
  del key_data["model_cooldowns"][model]
1288
 
1289
+ # Record token and cost usage
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1290
  if (
1291
  completion_response
1292
  and hasattr(completion_response, "usage")
1293
  and completion_response.usage
1294
  ):
1295
  usage = completion_response.usage
1296
+ usage_data_ref["prompt_tokens"] += usage.prompt_tokens
1297
+ usage_data_ref["completion_tokens"] += getattr(
1298
  usage, "completion_tokens", 0
1299
+ )
1300
  lib_logger.info(
1301
  f"Recorded usage from response object for key {mask_credential(key)}"
1302
  )
 
1304
  provider_name = model.split("/")[0]
1305
  provider_plugin = self.provider_plugins.get(provider_name)
1306
 
 
1307
  if provider_plugin and getattr(
1308
  provider_plugin, "skip_cost_calculation", False
1309
  ):
 
1311
  f"Skipping cost calculation for provider '{provider_name}' (custom provider)."
1312
  )
1313
  else:
 
1314
  if isinstance(completion_response, litellm.EmbeddingResponse):
 
1315
  model_info = litellm.get_model_info(model)
1316
  input_cost = model_info.get("input_cost_per_token")
1317
  if input_cost:
 
1326
  )
1327
 
1328
  if cost is not None:
1329
+ usage_data_ref["approx_cost"] += cost
1330
  except Exception as e:
1331
  lib_logger.warning(
1332
  f"Could not calculate cost for model {model}: {e}"
 
1334
  elif isinstance(completion_response, asyncio.Future) or hasattr(
1335
  completion_response, "__aiter__"
1336
  ):
1337
+ pass # Stream - usage recorded from chunks
 
1338
  else:
1339
  lib_logger.warning(
1340
  f"No usage data found in completion response for model {model}. Recording success without token count."
 
1351
  classified_error: ClassifiedError,
1352
  increment_consecutive_failures: bool = True,
1353
  ):
1354
+ """Records a failure and applies cooldowns based on error type.
1355
+
1356
+ Distinguishes between:
1357
+ - quota_exceeded: Long cooldown with exact reset time (from quota_reset_timestamp)
1358
+ Sets quota_reset_ts on model (and group) - this becomes authoritative stats reset time
1359
+ - rate_limit: Short transient cooldown (just wait and retry)
1360
+ Only sets model_cooldowns - does NOT affect stats reset timing
1361
 
1362
  Args:
1363
  key: The API key or credential identifier
 
1368
  """
1369
  await self._lazy_init()
1370
  async with self._data_lock:
1371
+ now_ts = time.time()
1372
  today_utc_str = datetime.now(timezone.utc).date().isoformat()
1373
 
 
 
1374
  reset_config = self._get_usage_reset_config(key)
1375
+ reset_mode = (
1376
+ reset_config.get("mode", "credential") if reset_config else "credential"
1377
+ )
1378
 
1379
  # Initialize key data with appropriate structure
1380
+ if reset_mode == "per_model":
1381
  key_data = self._usage_data.setdefault(
1382
  key,
1383
  {
1384
+ "models": {},
1385
  "global": {"models": {}},
1386
  "model_cooldowns": {},
1387
  "failures": {},
 
1409
 
1410
  # Calculate cooldown duration based on error type
1411
  cooldown_seconds = None
1412
+ model_cooldowns = key_data.setdefault("model_cooldowns", {})
1413
 
1414
+ if classified_error.error_type == "quota_exceeded":
1415
+ # Quota exhausted - use authoritative reset timestamp if available
1416
+ quota_reset_ts = classified_error.quota_reset_timestamp
1417
  cooldown_seconds = classified_error.retry_after or 60
1418
+
1419
+ if quota_reset_ts and reset_mode == "per_model":
1420
+ # Set quota_reset_ts on model - this becomes authoritative stats reset time
1421
+ models_data = key_data.setdefault("models", {})
1422
+ model_data = models_data.setdefault(
1423
+ model,
1424
+ {
1425
+ "window_start_ts": None,
1426
+ "quota_reset_ts": None,
1427
+ "success_count": 0,
1428
+ "prompt_tokens": 0,
1429
+ "completion_tokens": 0,
1430
+ "approx_cost": 0.0,
1431
+ },
1432
+ )
1433
+ model_data["quota_reset_ts"] = quota_reset_ts
1434
+
1435
+ # Apply to all models in the same quota group
1436
+ group = self._get_model_quota_group(key, model)
1437
+ if group:
1438
+ grouped_models = self._get_grouped_models(key, group)
1439
+ for grouped_model in grouped_models:
1440
+ group_model_data = models_data.setdefault(
1441
+ grouped_model,
1442
+ {
1443
+ "window_start_ts": None,
1444
+ "quota_reset_ts": None,
1445
+ "success_count": 0,
1446
+ "prompt_tokens": 0,
1447
+ "completion_tokens": 0,
1448
+ "approx_cost": 0.0,
1449
+ },
1450
+ )
1451
+ group_model_data["quota_reset_ts"] = quota_reset_ts
1452
+ # Also set transient cooldown for selection logic
1453
+ model_cooldowns[grouped_model] = quota_reset_ts
1454
+
1455
+ reset_dt = datetime.fromtimestamp(
1456
+ quota_reset_ts, tz=timezone.utc
1457
+ )
1458
  lib_logger.info(
1459
+ f"Quota exhausted for group '{group}' ({len(grouped_models)} models) "
1460
+ f"on {mask_credential(key)}. Resets at {reset_dt.isoformat()}"
1461
  )
1462
  else:
1463
+ reset_dt = datetime.fromtimestamp(
1464
+ quota_reset_ts, tz=timezone.utc
1465
+ )
1466
+ hours = (quota_reset_ts - now_ts) / 3600
1467
  lib_logger.info(
1468
+ f"Quota exhausted for model {model} on {mask_credential(key)}. "
1469
+ f"Resets at {reset_dt.isoformat()} ({hours:.1f}h)"
1470
  )
1471
+
1472
+ # Set transient cooldown for selection logic
1473
+ model_cooldowns[model] = quota_reset_ts
1474
  else:
1475
+ # No authoritative timestamp or legacy mode - just use retry_after
1476
+ model_cooldowns[model] = now_ts + cooldown_seconds
1477
+ hours = cooldown_seconds / 3600
1478
  lib_logger.info(
1479
+ f"Quota exhausted on {mask_credential(key)} for model {model}. "
1480
+ f"Cooldown: {cooldown_seconds}s ({hours:.1f}h)"
1481
  )
1482
+
1483
+ elif classified_error.error_type == "rate_limit":
1484
+ # Transient rate limit - just set short cooldown (does NOT set quota_reset_ts)
1485
+ cooldown_seconds = classified_error.retry_after or 60
1486
+ model_cooldowns[model] = now_ts + cooldown_seconds
1487
+ lib_logger.info(
1488
+ f"Rate limit on {mask_credential(key)} for model {model}. "
1489
+ f"Transient cooldown: {cooldown_seconds}s"
1490
+ )
1491
+
1492
  elif classified_error.error_type == "authentication":
1493
  # Apply a 5-minute key-level lockout for auth errors
1494
+ key_data["key_cooldown_until"] = now_ts + 300
1495
+ cooldown_seconds = 300
1496
+ model_cooldowns[model] = now_ts + cooldown_seconds
1497
  lib_logger.warning(
1498
  f"Authentication error on key {mask_credential(key)}. Applying 5-minute key-level lockout."
1499
  )
 
 
1500
 
1501
  # If we should increment failures, calculate escalating backoff
1502
  if should_increment:
 
1510
  # If cooldown wasn't set by specific error type, use escalating backoff
1511
  if cooldown_seconds is None:
1512
  backoff_tiers = {1: 10, 2: 30, 3: 60, 4: 120}
1513
+ cooldown_seconds = backoff_tiers.get(count, 7200)
1514
+ model_cooldowns[model] = now_ts + cooldown_seconds
 
1515
  lib_logger.warning(
1516
  f"Failure #{count} for key {mask_credential(key)} with model {model}. "
1517
+ f"Error type: {classified_error.error_type}, cooldown: {cooldown_seconds}s"
1518
  )
1519
  else:
1520
  # Provider-level errors: apply short cooldown but don't count against key
1521
  if cooldown_seconds is None:
1522
+ cooldown_seconds = 30
1523
+ model_cooldowns[model] = now_ts + cooldown_seconds
1524
  lib_logger.info(
1525
+ f"Provider-level error ({classified_error.error_type}) for key {mask_credential(key)} "
1526
+ f"with model {model}. NOT incrementing failures. Cooldown: {cooldown_seconds}s"
1527
  )
1528
 
 
 
 
 
 
 
 
 
1529
  # Check for key-level lockout condition
1530
  await self._check_key_lockout(key, key_data)
1531
 
1532
  key_data["last_failure"] = {
1533
+ "timestamp": now_ts,
1534
  "model": model,
1535
  "error": str(classified_error.original_exception),
1536
  }