Spaces:

elmerzole
/

llm-api-proxy

Paused

File size: 114,063 Bytes

bf565ba
 
 
d51ae99
aa8035e
f35e0e7
27b342a
467f294
 
4bbfff4
21dcb11
27b342a
fce1762
63418af
7f148b3
467f294
bf565ba
39e01ca
4bbfff4
d51ae99
 
 
39e01ca
bf565ba
 
4bbfff4
f35e0e7
4dfb828
f35e0e7
4dfb828
f35e0e7
 
 
4dfb828
f35e0e7
 
 
 
4dfb828
f35e0e7
 
 
4dfb828
f35e0e7
4dfb828
f35e0e7
 
bd84d38
 
 
 
bf565ba
39e01ca
 
 
467f294
39e01ca
f35e0e7
bd84d38
98f6823
aefb706
 
 
 
 
39e01ca
f35e0e7
 
4dfb828
f35e0e7
467f294
 
f35e0e7
 
 
 
 
bd84d38
 
 
98f6823
 
aefb706
 
 
 
 
 
 
 
f35e0e7
467f294
 
 
 
 
 
 
 
f35e0e7
bd84d38
98f6823
aefb706
 
 
fd01482
bab8bf0
39e01ca
4bbfff4
 
 
 
 
 
 
bf565ba
7f148b3
 
bf565ba
27b342a
39e01ca
 
 
 
27b342a
 
 
bd84d38
 
 
 
 
 
 
 
 
 
 
 
aefb706
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98f6823
 
 
 
 
 
 
1ac7bd0
98f6823
 
 
 
 
 
 
 
 
 
 
 
 
1ac7bd0
98f6823
 
 
 
 
 
 
 
 
1ac7bd0
 
 
 
 
98f6823
 
fd01482
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98f6823
 
 
 
 
 
 
 
 
 
 
 
fd01482
98f6823
fd01482
 
98f6823
 
 
0ca1651
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fd01482
0ca1651
fd01482
 
0ca1651
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fd01482
0ca1651
fd01482
 
0ca1651
 
 
 
 
aefb706
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48b6b15
 
 
 
aefb706
 
 
 
48b6b15
 
 
 
1ac7bd0
 
 
 
aefb706
 
 
 
 
1ac7bd0
aefb706
48b6b15
 
 
 
 
 
 
 
 
1ac7bd0
 
 
 
 
 
 
 
aefb706
 
 
 
 
 
 
 
 
48b6b15
aefb706
 
 
 
 
48b6b15
aefb706
1af1879
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98f6823
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fd01482
 
 
 
98f6823
 
 
48b6b15
 
 
98f6823
0ca1651
 
 
 
 
98f6823
 
 
 
48b6b15
 
 
98f6823
 
48b6b15
98f6823
 
 
 
 
0ca1651
98f6823
0ca1651
48b6b15
 
0ca1651
48b6b15
0ca1651
48b6b15
0ca1651
98f6823
4cdd261
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
aefb706
bd84d38
 
 
aefb706
bd84d38
aefb706
bd84d38
aefb706
 
bd84d38
aefb706
 
 
 
 
bd84d38
 
 
 
 
 
aefb706
bd84d38
 
aefb706
bd84d38
 
aefb706
bd84d38
aefb706
 
bd84d38
 
 
 
 
 
 
 
aefb706
 
 
 
 
 
bd84d38
 
 
aefb706
 
 
 
 
 
 
bd84d38
aefb706
bd84d38
4bbfff4
27b342a
4bbfff4
 
 
 
 
 
 
7f148b3
4bbfff4
bf565ba
4bbfff4
 
31c3d36
bf565ba
39e01ca
4bbfff4
31c3d36
 
7f148b3
31c3d36
 
01932f9
 
 
31c3d36
 
01932f9
 
 
4bbfff4
 
 
7f148b3
4bbfff4
 
01932f9
7f148b3
 
 
 
 
4bbfff4
2be7535
 
 
 
 
 
 
 
 
 
 
818ba14
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4bbfff4
98f6823
 
 
0ca1651
 
 
 
98f6823
 
4bbfff4
 
27b342a
98f6823
27b342a
aea7b14
27b342a
4bbfff4
98f6823
33e95e6
98f6823
0ca1651
39e01ca
0ca1651
 
 
 
4dfb828
0ca1651
 
 
 
 
98f6823
0ca1651
98f6823
 
39e01ca
27b342a
aea7b14
4bbfff4
aea7b14
0ca1651
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2be7535
 
0ca1651
 
 
2be7535
 
 
 
 
 
 
 
 
 
0ca1651
98f6823
 
 
 
 
 
 
 
 
39e01ca
98f6823
 
 
 
 
27b342a
98f6823
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
aea7b14
bab8bf0
 
aa8035e
bab8bf0
 
 
 
79e83ae
bab8bf0
aa8035e
4dfb828
f35e0e7
 
4dfb828
f35e0e7
 
 
4dfb828
f35e0e7
 
4dfb828
f35e0e7
 
4dfb828
f35e0e7
 
 
 
 
 
 
4dfb828
f35e0e7
 
4dfb828
f35e0e7
 
 
4dfb828
f35e0e7
 
 
 
 
4dfb828
f35e0e7
 
 
 
4dfb828
f35e0e7
 
4dfb828
 
f35e0e7
 
4dfb828
f35e0e7
4dfb828
f35e0e7
 
39e01ca
4dfb828
 
 
 
b5da45c
4dfb828
 
39e01ca
bab8bf0
62ed41b
b5da45c
4dfb828
b5da45c
 
 
 
 
4dfb828
b5da45c
 
 
 
 
 
4dfb828
 
b5da45c
 
4dfb828
b5da45c
 
bab8bf0
4bbfff4
aaab6f8
bab8bf0
 
62ed41b
 
 
39e01ca
b5da45c
 
 
 
 
 
 
4dfb828
b5da45c
 
 
 
 
4dfb828
b5da45c
 
4dfb828
b5da45c
aefb706
 
4dfb828
b5da45c
 
 
 
4dfb828
b5da45c
 
4dfb828
b5da45c
 
4dfb828
aefb706
 
 
 
 
 
 
 
 
 
b5da45c
 
 
 
4dfb828
b5da45c
 
 
 
aefb706
 
 
 
b5da45c
4dfb828
bd84d38
aefb706
 
bd84d38
 
aefb706
bd84d38
 
 
aefb706
bd84d38
 
 
 
 
f35e0e7
4dfb828
 
 
 
 
 
f35e0e7
4dfb828
 
 
 
 
 
f35e0e7
 
bd84d38
f35e0e7
 
4dfb828
b5da45c
 
 
 
 
 
4dfb828
 
 
 
 
1af1879
b5da45c
4dfb828
1af1879
b5da45c
 
4dfb828
b5da45c
 
 
 
 
aefb706
b5da45c
4dfb828
 
 
 
 
1af1879
b5da45c
4dfb828
1af1879
b5da45c
 
4dfb828
b5da45c
 
 
 
 
4dfb828
b5da45c
 
 
 
 
 
4dfb828
b5da45c
 
 
 
 
4dfb828
b5da45c
 
 
4dfb828
b5da45c
 
aefb706
 
 
 
 
 
 
 
 
 
 
 
 
b5da45c
 
 
 
 
 
 
 
 
 
 
 
 
aefb706
 
b5da45c
 
 
 
 
 
aefb706
 
 
 
b5da45c
 
bd84d38
aefb706
 
bd84d38
 
aefb706
bd84d38
 
 
aefb706
bd84d38
 
 
 
 
f35e0e7
4dfb828
 
 
 
 
 
f35e0e7
4dfb828
 
 
 
 
 
f35e0e7
 
bd84d38
f35e0e7
 
b5da45c
 
f35e0e7
b5da45c
 
 
 
4dfb828
 
 
 
 
 
1af1879
b5da45c
4dfb828
1af1879
b5da45c
 
 
 
f35e0e7
b5da45c
 
 
aefb706
b5da45c
4dfb828
 
 
 
 
 
1af1879
b5da45c
4dfb828
1af1879
b5da45c
 
39e01ca
b5da45c
 
 
39e01ca
bab8bf0
b5da45c
 
 
 
 
 
 
 
 
 
 
39e01ca
bab8bf0
 
62ed41b
 
39e01ca
62ed41b
39e01ca
 
 
bab8bf0
 
62ed41b
83934f3
bab8bf0
39e01ca
 
 
 
bab8bf0
 
 
 
 
 
 
 
 
79e83ae
 
 
 
 
fce1762
79e83ae
 
bab8bf0
39e01ca
fce1762
39e01ca
bab8bf0
 
 
 
4bbfff4
39e01ca
 
 
 
 
 
27b342a
 
 
98f6823
0ca1651
 
 
27b342a
4bbfff4
 
98f6823
33e95e6
98f6823
 
0ca1651
 
39e01ca
 
0ca1651
 
98f6823
 
 
0ca1651
98f6823
 
 
 
 
0ca1651
 
 
 
 
 
 
 
 
 
 
 
2be7535
 
0ca1651
 
 
 
 
 
 
 
 
4cdd261
 
 
 
0ca1651
4cdd261
 
 
 
0ca1651
 
 
 
 
 
2be7535
 
1ac7bd0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2be7535
 
 
 
 
 
 
0ca1651
 
98f6823
0ca1651
98f6823
 
 
 
 
 
 
 
 
39e01ca
0ca1651
 
39e01ca
0ca1651
 
 
 
 
 
 
 
 
 
 
 
 
27b342a
 
0ca1651
 
27b342a
 
 
0ca1651
39e01ca
 
 
 
 
27b342a
0ca1651
 
39e01ca
0ca1651
39e01ca
fce1762
39e01ca
27b342a
39e01ca
fd01482
63418af
fd01482
 
3962356
39e01ca
 
 
64e385a
63418af
19af906
 
 
39e01ca
 
 
19af906
 
63418af
39e01ca
 
 
 
63418af
0ca1651
27b342a
39e01ca
 
 
 
 
 
0ca1651
27b342a
39e01ca
 
 
bf565ba
98f6823
39e01ca
4bbfff4
 
39e01ca
4dfb828
 
 
 
 
39e01ca
0ca1651
 
 
 
 
 
 
20944a5
 
 
 
 
 
 
 
4bbfff4
 
0ca1651
33e95e6
98f6823
 
0ca1651
 
39e01ca
 
98f6823
0ca1651
98f6823
 
 
0ca1651
98f6823
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39e01ca
20944a5
 
 
 
 
 
 
 
 
 
 
0ca1651
20944a5
0ca1651
 
 
20944a5
0ca1651
 
 
 
 
 
 
 
 
 
2be7535
 
0ca1651
 
 
 
 
 
2be7535
 
 
1ac7bd0
0ca1651
 
 
 
 
 
 
 
 
 
 
 
2be7535
 
0ca1651
 
 
 
 
 
1ac7bd0
 
 
 
 
 
 
 
 
0ca1651
 
 
 
 
 
bd84d38
0ca1651
 
bd84d38
 
0ca1651
 
 
 
bd84d38
0ca1651
 
bd84d38
0ca1651
 
 
bd84d38
0ca1651
 
 
bd84d38
0ca1651
 
bd84d38
0ca1651
 
 
 
 
20944a5
0ca1651
 
20944a5
0ca1651
39e01ca
27b342a
0ca1651
 
 
39e01ca
fce1762
39e01ca
20944a5
 
 
27b342a
39e01ca
 
 
27b342a
 
 
20944a5
 
 
0ca1651
 
20944a5
fce1762
0ca1651
20944a5
 
 
 
0ca1651
 
20944a5
0ca1651
 
20944a5
27b342a
 
 
 
2be7535
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1ac7bd0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27b342a
0ca1651
4bbfff4
39e01ca
4bbfff4
39e01ca
4bbfff4
27b342a
2be7535
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39e01ca
2be7535
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1ac7bd0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2be7535
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8b4ff52

import json
import os
import time
import logging
import asyncio
import random
from datetime import date, datetime, timezone, time as dt_time
from pathlib import Path
from typing import Any, Dict, List, Optional, Set, Tuple, Union
import aiofiles
import litellm

from .error_handler import ClassifiedError, NoAvailableKeysError, mask_credential
from .providers import PROVIDER_PLUGINS
from .utils.resilient_io import ResilientStateWriter
from .utils.paths import get_data_file

lib_logger = logging.getLogger("rotator_library")
lib_logger.propagate = False
if not lib_logger.handlers:
    lib_logger.addHandler(logging.NullHandler())


class UsageManager:
    """
    Manages usage statistics and cooldowns for API keys with asyncio-safe locking,
    asynchronous file I/O, lazy-loading mechanism, and weighted random credential rotation.

    The credential rotation strategy can be configured via the `rotation_tolerance` parameter:

    - **tolerance = 0.0**: Deterministic least-used selection. The credential with
      the lowest usage count is always selected. This provides predictable, perfectly balanced
      load distribution but may be vulnerable to fingerprinting.

    - **tolerance = 2.0 - 4.0 (default, recommended)**: Balanced weighted randomness. Credentials are selected
      randomly with weights biased toward less-used ones. Credentials within 2 uses of the
      maximum can still be selected with reasonable probability. This provides security through
      unpredictability while maintaining good load balance.

    - **tolerance = 5.0+**: High randomness. Even heavily-used credentials have significant
      selection probability. Useful for stress testing or maximum unpredictability, but may
      result in less balanced load distribution.

    The weight formula is: `weight = (max_usage - credential_usage) + tolerance + 1`

    This ensures lower-usage credentials are preferred while tolerance controls how much
    randomness is introduced into the selection process.

    Additionally, providers can specify a rotation mode:
    - "balanced" (default): Rotate credentials to distribute load evenly
    - "sequential": Use one credential until exhausted (preserves caching)
    """

    def __init__(
        self,
        file_path: Optional[Union[str, Path]] = None,
        daily_reset_time_utc: Optional[str] = "03:00",
        rotation_tolerance: float = 0.0,
        provider_rotation_modes: Optional[Dict[str, str]] = None,
        provider_plugins: Optional[Dict[str, Any]] = None,
        priority_multipliers: Optional[Dict[str, Dict[int, int]]] = None,
        priority_multipliers_by_mode: Optional[
            Dict[str, Dict[str, Dict[int, int]]]
        ] = None,
        sequential_fallback_multipliers: Optional[Dict[str, int]] = None,
    ):
        """
        Initialize the UsageManager.

        Args:
            file_path: Path to the usage data JSON file. If None, uses get_data_file("key_usage.json").
                       Can be absolute Path, relative Path, or string.
            daily_reset_time_utc: Time in UTC when daily stats should reset (HH:MM format)
            rotation_tolerance: Tolerance for weighted random credential rotation.
                - 0.0: Deterministic, least-used credential always selected
                - tolerance = 2.0 - 4.0 (default, recommended): Balanced randomness, can pick credentials within 2 uses of max
                - 5.0+: High randomness, more unpredictable selection patterns
            provider_rotation_modes: Dict mapping provider names to rotation modes.
                - "balanced": Rotate credentials to distribute load evenly (default)
                - "sequential": Use one credential until exhausted (preserves caching)
            provider_plugins: Dict mapping provider names to provider plugin instances.
                Used for per-provider usage reset configuration (window durations, field names).
            priority_multipliers: Dict mapping provider -> priority -> multiplier.
                Universal multipliers that apply regardless of rotation mode.
                Example: {"antigravity": {1: 5, 2: 3}}
            priority_multipliers_by_mode: Dict mapping provider -> mode -> priority -> multiplier.
                Mode-specific overrides. Example: {"antigravity": {"balanced": {3: 1}}}
            sequential_fallback_multipliers: Dict mapping provider -> fallback multiplier.
                Used in sequential mode when priority not in priority_multipliers.
                Example: {"antigravity": 2}
        """
        # Resolve file_path - use default if not provided
        if file_path is None:
            self.file_path = str(get_data_file("key_usage.json"))
        elif isinstance(file_path, Path):
            self.file_path = str(file_path)
        else:
            # String path - could be relative or absolute
            self.file_path = file_path
        self.rotation_tolerance = rotation_tolerance
        self.provider_rotation_modes = provider_rotation_modes or {}
        self.provider_plugins = provider_plugins or PROVIDER_PLUGINS
        self.priority_multipliers = priority_multipliers or {}
        self.priority_multipliers_by_mode = priority_multipliers_by_mode or {}
        self.sequential_fallback_multipliers = sequential_fallback_multipliers or {}
        self._provider_instances: Dict[str, Any] = {}  # Cache for provider instances
        self.key_states: Dict[str, Dict[str, Any]] = {}

        self._data_lock = asyncio.Lock()
        self._usage_data: Optional[Dict] = None
        self._initialized = asyncio.Event()
        self._init_lock = asyncio.Lock()

        self._timeout_lock = asyncio.Lock()
        self._claimed_on_timeout: Set[str] = set()

        # Resilient writer for usage data persistence
        self._state_writer = ResilientStateWriter(file_path, lib_logger)

        if daily_reset_time_utc:
            hour, minute = map(int, daily_reset_time_utc.split(":"))
            self.daily_reset_time_utc = dt_time(
                hour=hour, minute=minute, tzinfo=timezone.utc
            )
        else:
            self.daily_reset_time_utc = None

    def _get_rotation_mode(self, provider: str) -> str:
        """
        Get the rotation mode for a provider.

        Args:
            provider: Provider name (e.g., "antigravity", "gemini_cli")

        Returns:
            "balanced" or "sequential"
        """
        return self.provider_rotation_modes.get(provider, "balanced")

    def _get_priority_multiplier(
        self, provider: str, priority: int, rotation_mode: str
    ) -> int:
        """
        Get the concurrency multiplier for a provider/priority/mode combination.

        Lookup order:
        1. Mode-specific tier override: priority_multipliers_by_mode[provider][mode][priority]
        2. Universal tier multiplier: priority_multipliers[provider][priority]
        3. Sequential fallback (if mode is sequential): sequential_fallback_multipliers[provider]
        4. Global default: 1 (no multiplier effect)

        Args:
            provider: Provider name (e.g., "antigravity")
            priority: Priority level (1 = highest priority)
            rotation_mode: Current rotation mode ("sequential" or "balanced")

        Returns:
            Multiplier value
        """
        provider_lower = provider.lower()

        # 1. Check mode-specific override
        if provider_lower in self.priority_multipliers_by_mode:
            mode_multipliers = self.priority_multipliers_by_mode[provider_lower]
            if rotation_mode in mode_multipliers:
                if priority in mode_multipliers[rotation_mode]:
                    return mode_multipliers[rotation_mode][priority]

        # 2. Check universal tier multiplier
        if provider_lower in self.priority_multipliers:
            if priority in self.priority_multipliers[provider_lower]:
                return self.priority_multipliers[provider_lower][priority]

        # 3. Sequential fallback (only for sequential mode)
        if rotation_mode == "sequential":
            if provider_lower in self.sequential_fallback_multipliers:
                return self.sequential_fallback_multipliers[provider_lower]

        # 4. Global default
        return 1

    def _get_provider_from_credential(self, credential: str) -> Optional[str]:
        """
        Extract provider name from credential path or identifier.

        Supports multiple credential formats:
        - OAuth: "oauth_creds/antigravity_oauth_15.json" -> "antigravity"
        - OAuth: "C:\\...\\oauth_creds\\gemini_cli_oauth_1.json" -> "gemini_cli"
        - OAuth filename only: "antigravity_oauth_1.json" -> "antigravity"
        - API key style: stored with provider prefix metadata

        Args:
            credential: The credential identifier (path or key)

        Returns:
            Provider name string or None if cannot be determined
        """
        import re

        # Normalize path separators
        normalized = credential.replace("\\", "/")

        # Pattern: path ending with {provider}_oauth_{number}.json
        match = re.search(r"/([a-z_]+)_oauth_\d+\.json$", normalized, re.IGNORECASE)
        if match:
            return match.group(1).lower()

        # Pattern: oauth_creds/{provider}_...
        match = re.search(r"oauth_creds/([a-z_]+)_", normalized, re.IGNORECASE)
        if match:
            return match.group(1).lower()

        # Pattern: filename only {provider}_oauth_{number}.json (no path)
        match = re.match(r"([a-z_]+)_oauth_\d+\.json$", normalized, re.IGNORECASE)
        if match:
            return match.group(1).lower()

        return None

    def _get_provider_instance(self, provider: str) -> Optional[Any]:
        """
        Get or create a provider plugin instance.

        Args:
            provider: The provider name

        Returns:
            Provider plugin instance or None
        """
        if not provider:
            return None

        plugin_class = self.provider_plugins.get(provider)
        if not plugin_class:
            return None

        # Get or create provider instance from cache
        if provider not in self._provider_instances:
            # Instantiate the plugin if it's a class, or use it directly if already an instance
            if isinstance(plugin_class, type):
                self._provider_instances[provider] = plugin_class()
            else:
                self._provider_instances[provider] = plugin_class

        return self._provider_instances[provider]

    def _get_usage_reset_config(self, credential: str) -> Optional[Dict[str, Any]]:
        """
        Get the usage reset configuration for a credential from its provider plugin.

        Args:
            credential: The credential identifier

        Returns:
            Configuration dict with window_seconds, field_name, etc.
            or None to use default daily reset.
        """
        provider = self._get_provider_from_credential(credential)
        plugin_instance = self._get_provider_instance(provider)

        if plugin_instance and hasattr(plugin_instance, "get_usage_reset_config"):
            return plugin_instance.get_usage_reset_config(credential)

        return None

    def _get_reset_mode(self, credential: str) -> str:
        """
        Get the reset mode for a credential: 'credential' or 'per_model'.

        Args:
            credential: The credential identifier

        Returns:
            "per_model" or "credential" (default)
        """
        config = self._get_usage_reset_config(credential)
        return config.get("mode", "credential") if config else "credential"

    def _get_model_quota_group(self, credential: str, model: str) -> Optional[str]:
        """
        Get the quota group for a model, if the provider defines one.

        Args:
            credential: The credential identifier
            model: Model name (with or without provider prefix)

        Returns:
            Group name (e.g., "claude") or None if not grouped
        """
        provider = self._get_provider_from_credential(credential)
        plugin_instance = self._get_provider_instance(provider)

        if plugin_instance and hasattr(plugin_instance, "get_model_quota_group"):
            return plugin_instance.get_model_quota_group(model)

        return None

    def _get_grouped_models(self, credential: str, group: str) -> List[str]:
        """
        Get all model names in a quota group (with provider prefix).

        Args:
            credential: The credential identifier
            group: Group name (e.g., "claude")

        Returns:
            List of full model names (e.g., ["antigravity/claude-opus-4-5", ...])
        """
        provider = self._get_provider_from_credential(credential)
        plugin_instance = self._get_provider_instance(provider)

        if plugin_instance and hasattr(plugin_instance, "get_models_in_quota_group"):
            models = plugin_instance.get_models_in_quota_group(group)
            # Add provider prefix
            return [f"{provider}/{m}" for m in models]

        return []

    def _get_model_usage_weight(self, credential: str, model: str) -> int:
        """
        Get the usage weight for a model when calculating grouped usage.

        Args:
            credential: The credential identifier
            model: Model name (with or without provider prefix)

        Returns:
            Weight multiplier (default 1 if not configured)
        """
        provider = self._get_provider_from_credential(credential)
        plugin_instance = self._get_provider_instance(provider)

        if plugin_instance and hasattr(plugin_instance, "get_model_usage_weight"):
            return plugin_instance.get_model_usage_weight(model)

        return 1

    # Providers where request_count should be used for credential selection
    # instead of success_count (because failed requests also consume quota)
    _REQUEST_COUNT_PROVIDERS = {"antigravity"}

    def _get_grouped_usage_count(self, key: str, model: str) -> int:
        """
        Get usage count for credential selection, considering quota groups.

        For providers in _REQUEST_COUNT_PROVIDERS (e.g., antigravity), uses
        request_count instead of success_count since failed requests also
        consume quota.

        If the model belongs to a quota group, the request_count is already
        synced across all models in the group (by record_success/record_failure),
        so we just read from the requested model directly.

        Args:
            key: Credential identifier
            model: Model name (with provider prefix, e.g., "antigravity/claude-sonnet-4-5")

        Returns:
            Usage count for the model (synced across group if applicable)
        """
        # Determine usage field based on provider
        # Some providers (antigravity) count failed requests against quota
        provider = self._get_provider_from_credential(key)
        usage_field = (
            "request_count"
            if provider in self._REQUEST_COUNT_PROVIDERS
            else "success_count"
        )

        # For providers with synced quota groups (antigravity), request_count
        # is already synced across all models in the group, so just read directly.
        # For other providers, we still need to sum success_count across group.
        if provider in self._REQUEST_COUNT_PROVIDERS:
            # request_count is synced - just read the model's value
            return self._get_usage_count(key, model, usage_field)

        # For non-synced providers, check if model is in a quota group and sum
        group = self._get_model_quota_group(key, model)

        if group:
            # Get all models in the group
            grouped_models = self._get_grouped_models(key, group)

            # Sum weighted usage across all models in the group
            total_weighted_usage = 0
            for grouped_model in grouped_models:
                usage = self._get_usage_count(key, grouped_model, usage_field)
                weight = self._get_model_usage_weight(key, grouped_model)
                total_weighted_usage += usage * weight
            return total_weighted_usage

        # Not grouped - return individual model usage (no weight applied)
        return self._get_usage_count(key, model, usage_field)

    def _get_quota_display(self, key: str, model: str) -> str:
        """
        Get a formatted quota display string for logging.

        For antigravity (providers in _REQUEST_COUNT_PROVIDERS), returns:
            "quota: 170/250 [32%]" format

        For other providers, returns:
            "usage: 170" format (no max available)

        Args:
            key: Credential identifier
            model: Model name (with provider prefix)

        Returns:
            Formatted string for logging
        """
        provider = self._get_provider_from_credential(key)

        if provider not in self._REQUEST_COUNT_PROVIDERS:
            # Non-antigravity: just show usage count
            usage = self._get_usage_count(key, model, "success_count")
            return f"usage: {usage}"

        # Antigravity: show quota display with remaining percentage
        if self._usage_data is None:
            return "quota: 0/? [100%]"

        key_data = self._usage_data.get(key, {})
        model_data = key_data.get("models", {}).get(model, {})

        request_count = model_data.get("request_count", 0)
        max_requests = model_data.get("quota_max_requests")

        if max_requests:
            remaining = max_requests - request_count
            remaining_pct = (
                int((remaining / max_requests) * 100) if max_requests > 0 else 0
            )
            return f"quota: {request_count}/{max_requests} [{remaining_pct}%]"
        else:
            return f"quota: {request_count}"

    def _get_usage_field_name(self, credential: str) -> str:
        """
        Get the usage tracking field name for a credential.

        Returns the provider-specific field name if configured,
        otherwise falls back to "daily".

        Args:
            credential: The credential identifier

        Returns:
            Field name string (e.g., "5h_window", "weekly", "daily")
        """
        config = self._get_usage_reset_config(credential)
        if config and "field_name" in config:
            return config["field_name"]

        # Check provider default
        provider = self._get_provider_from_credential(credential)
        plugin_instance = self._get_provider_instance(provider)

        if plugin_instance and hasattr(plugin_instance, "get_default_usage_field_name"):
            return plugin_instance.get_default_usage_field_name()

        return "daily"

    def _get_usage_count(
        self, key: str, model: str, field: str = "success_count"
    ) -> int:
        """
        Get the current usage count for a model from the appropriate usage structure.

        Supports both:
        - New per-model structure: {"models": {"model_name": {"success_count": N, ...}}}
        - Legacy structure: {"daily": {"models": {"model_name": {"success_count": N, ...}}}}

        Args:
            key: Credential identifier
            model: Model name
            field: The field to read for usage count (default: "success_count").
                   Use "request_count" for providers where failed requests also
                   consume quota (e.g., antigravity).

        Returns:
            Usage count for the model in the current window/period
        """
        if self._usage_data is None:
            return 0

        key_data = self._usage_data.get(key, {})
        reset_mode = self._get_reset_mode(key)

        if reset_mode == "per_model":
            # New per-model structure: key_data["models"][model][field]
            return key_data.get("models", {}).get(model, {}).get(field, 0)
        else:
            # Legacy structure: key_data["daily"]["models"][model][field]
            return (
                key_data.get("daily", {}).get("models", {}).get(model, {}).get(field, 0)
            )

    # =========================================================================
    # TIMESTAMP FORMATTING HELPERS
    # =========================================================================

    def _format_timestamp_local(self, ts: Optional[float]) -> Optional[str]:
        """
        Format Unix timestamp as local time string with timezone offset.

        Args:
            ts: Unix timestamp or None

        Returns:
            Formatted string like "2025-12-07 14:30:17 +0100" or None
        """
        if ts is None:
            return None
        try:
            dt = datetime.fromtimestamp(ts).astimezone()  # Local timezone
            # Use UTC offset for conciseness (works on all platforms)
            return dt.strftime("%Y-%m-%d %H:%M:%S %z")
        except (OSError, ValueError, OverflowError):
            return None

    def _add_readable_timestamps(self, data: Dict) -> Dict:
        """
        Add human-readable timestamp fields to usage data before saving.

        Adds 'window_started' and 'quota_resets' fields derived from
        Unix timestamps for easier debugging and monitoring.

        Args:
            data: The usage data dict to enhance

        Returns:
            The same dict with readable timestamp fields added
        """
        for key, key_data in data.items():
            # Handle per-model structure
            models = key_data.get("models", {})
            for model_name, model_stats in models.items():
                if not isinstance(model_stats, dict):
                    continue

                # Add readable window start time
                window_start = model_stats.get("window_start_ts")
                if window_start:
                    model_stats["window_started"] = self._format_timestamp_local(
                        window_start
                    )
                elif "window_started" in model_stats:
                    del model_stats["window_started"]

                # Add readable reset time
                quota_reset = model_stats.get("quota_reset_ts")
                if quota_reset:
                    model_stats["quota_resets"] = self._format_timestamp_local(
                        quota_reset
                    )
                elif "quota_resets" in model_stats:
                    del model_stats["quota_resets"]

        return data

    def _sort_sequential(
        self,
        candidates: List[Tuple[str, int]],
        credential_priorities: Optional[Dict[str, int]] = None,
    ) -> List[Tuple[str, int]]:
        """
        Sort credentials for sequential mode with position retention.

        Credentials maintain their position based on established usage patterns,
        ensuring that actively-used credentials remain primary until exhausted.

        Sorting order (within each sort key, lower value = higher priority):
        1. Priority tier (lower number = higher priority)
        2. Usage count (higher = more established in rotation, maintains position)
        3. Last used timestamp (higher = more recent, tiebreaker for stickiness)
        4. Credential ID (alphabetical, stable ordering)

        Args:
            candidates: List of (credential_id, usage_count) tuples
            credential_priorities: Optional dict mapping credentials to priority levels

        Returns:
            Sorted list of candidates (same format as input)
        """
        if not candidates:
            return []

        if len(candidates) == 1:
            return candidates

        def sort_key(item: Tuple[str, int]) -> Tuple[int, int, float, str]:
            cred, usage_count = item
            priority = (
                credential_priorities.get(cred, 999) if credential_priorities else 999
            )
            last_used = (
                self._usage_data.get(cred, {}).get("last_used_ts", 0)
                if self._usage_data
                else 0
            )
            return (
                priority,  # ASC: lower priority number = higher priority
                -usage_count,  # DESC: higher usage = more established
                -last_used,  # DESC: more recent = preferred for ties
                cred,  # ASC: stable alphabetical ordering
            )

        sorted_candidates = sorted(candidates, key=sort_key)

        # Debug logging - show top 3 credentials in ordering
        if lib_logger.isEnabledFor(logging.DEBUG):
            order_info = [
                f"{mask_credential(c)}(p={credential_priorities.get(c, 999) if credential_priorities else 'N/A'}, u={u})"
                for c, u in sorted_candidates[:3]
            ]
            lib_logger.debug(f"Sequential ordering: {' → '.join(order_info)}")

        return sorted_candidates

    async def _lazy_init(self):
        """Initializes the usage data by loading it from the file asynchronously."""
        async with self._init_lock:
            if not self._initialized.is_set():
                await self._load_usage()
                await self._reset_daily_stats_if_needed()
                self._initialized.set()

    async def _load_usage(self):
        """Loads usage data from the JSON file asynchronously with resilience."""
        async with self._data_lock:
            if not os.path.exists(self.file_path):
                self._usage_data = {}
                return

            try:
                async with aiofiles.open(self.file_path, "r") as f:
                    content = await f.read()
                    self._usage_data = json.loads(content) if content.strip() else {}
            except FileNotFoundError:
                # File deleted between exists check and open
                self._usage_data = {}
            except json.JSONDecodeError as e:
                lib_logger.warning(
                    f"Corrupted usage file {self.file_path}: {e}. Starting fresh."
                )
                self._usage_data = {}
            except (OSError, PermissionError, IOError) as e:
                lib_logger.warning(
                    f"Cannot read usage file {self.file_path}: {e}. Using empty state."
                )
                self._usage_data = {}

    async def _save_usage(self):
        """Saves the current usage data using the resilient state writer."""
        if self._usage_data is None:
            return

        async with self._data_lock:
            # Add human-readable timestamp fields before saving
            self._add_readable_timestamps(self._usage_data)
            # Hand off to resilient writer - handles retries and disk failures
            self._state_writer.write(self._usage_data)

    async def _get_usage_data_snapshot(self) -> Dict[str, Any]:
        """
        Get a shallow copy of the current usage data.

        Returns:
            Copy of usage data dict (safe for reading without lock)
        """
        await self._lazy_init()
        async with self._data_lock:
            return dict(self._usage_data) if self._usage_data else {}

    async def get_available_credentials_for_model(
        self, credentials: List[str], model: str
    ) -> List[str]:
        """
        Get credentials that are not on cooldown for a specific model.

        Filters out credentials where:
        - key_cooldown_until > now (key-level cooldown)
        - model_cooldowns[model] > now (model-specific cooldown, includes quota exhausted)

        Args:
            credentials: List of credential identifiers to check
            model: Model name to check cooldowns for

        Returns:
            List of credentials that are available (not on cooldown) for this model
        """
        await self._lazy_init()
        now = time.time()
        available = []

        async with self._data_lock:
            for key in credentials:
                key_data = self._usage_data.get(key, {})

                # Skip if key-level cooldown is active
                if (key_data.get("key_cooldown_until") or 0) > now:
                    continue

                # Skip if model-specific cooldown is active
                if (key_data.get("model_cooldowns", {}).get(model) or 0) > now:
                    continue

                available.append(key)

        return available

    async def _reset_daily_stats_if_needed(self):
        """
        Checks if usage stats need to be reset for any key.

        Supports three reset modes:
        1. per_model: Each model has its own window, resets based on quota_reset_ts or fallback window
        2. credential: One window per credential (legacy with custom window duration)
        3. daily: Legacy daily reset at daily_reset_time_utc
        """
        if self._usage_data is None:
            return

        now_utc = datetime.now(timezone.utc)
        now_ts = time.time()
        today_str = now_utc.date().isoformat()
        needs_saving = False

        for key, data in self._usage_data.items():
            reset_config = self._get_usage_reset_config(key)

            if reset_config:
                reset_mode = reset_config.get("mode", "credential")

                if reset_mode == "per_model":
                    # Per-model window reset
                    needs_saving |= await self._check_per_model_resets(
                        key, data, reset_config, now_ts
                    )
                else:
                    # Credential-level window reset (legacy)
                    needs_saving |= await self._check_window_reset(
                        key, data, reset_config, now_ts
                    )
            elif self.daily_reset_time_utc:
                # Legacy daily reset
                needs_saving |= await self._check_daily_reset(
                    key, data, now_utc, today_str, now_ts
                )

        if needs_saving:
            await self._save_usage()

    async def _check_per_model_resets(
        self,
        key: str,
        data: Dict[str, Any],
        reset_config: Dict[str, Any],
        now_ts: float,
    ) -> bool:
        """
        Check and perform per-model resets for a credential.

        Each model resets independently based on:
        1. quota_reset_ts (authoritative, from quota exhausted error) if set
        2. window_start_ts + window_seconds (fallback) otherwise

        Grouped models reset together - all models in a group must be ready.

        Args:
            key: Credential identifier
            data: Usage data for this credential
            reset_config: Provider's reset configuration
            now_ts: Current timestamp

        Returns:
            True if data was modified and needs saving
        """
        window_seconds = reset_config.get("window_seconds", 86400)
        models_data = data.get("models", {})

        if not models_data:
            return False

        modified = False
        processed_groups = set()

        for model, model_data in list(models_data.items()):
            # Check if this model is in a quota group
            group = self._get_model_quota_group(key, model)

            if group:
                if group in processed_groups:
                    continue  # Already handled this group

                # Check if entire group should reset
                if self._should_group_reset(
                    key, group, models_data, window_seconds, now_ts
                ):
                    # Archive and reset all models in group
                    grouped_models = self._get_grouped_models(key, group)
                    archived_count = 0

                    for grouped_model in grouped_models:
                        if grouped_model in models_data:
                            gm_data = models_data[grouped_model]
                            self._archive_model_to_global(data, grouped_model, gm_data)
                            self._reset_model_data(gm_data)
                            archived_count += 1

                    if archived_count > 0:
                        lib_logger.info(
                            f"Reset model group '{group}' ({archived_count} models) for {mask_credential(key)}"
                        )
                        modified = True

                processed_groups.add(group)

            else:
                # Ungrouped model - check individually
                if self._should_model_reset(model_data, window_seconds, now_ts):
                    self._archive_model_to_global(data, model, model_data)
                    self._reset_model_data(model_data)
                    lib_logger.info(f"Reset model {model} for {mask_credential(key)}")
                    modified = True

        # Preserve unexpired cooldowns
        if modified:
            self._preserve_unexpired_cooldowns(key, data, now_ts)
            if "failures" in data:
                data["failures"] = {}

        return modified

    def _should_model_reset(
        self, model_data: Dict[str, Any], window_seconds: int, now_ts: float
    ) -> bool:
        """
        Check if a single model should reset.

        Returns True if:
        - quota_reset_ts is set AND now >= quota_reset_ts, OR
        - quota_reset_ts is NOT set AND now >= window_start_ts + window_seconds
        """
        quota_reset = model_data.get("quota_reset_ts")
        window_start = model_data.get("window_start_ts")

        if quota_reset:
            return now_ts >= quota_reset
        elif window_start:
            return now_ts >= window_start + window_seconds
        return False

    def _should_group_reset(
        self,
        key: str,
        group: str,
        models_data: Dict[str, Dict],
        window_seconds: int,
        now_ts: float,
    ) -> bool:
        """
        Check if all models in a group should reset.

        All models in the group must be ready to reset.
        If any model has an active cooldown/window, the whole group waits.
        """
        grouped_models = self._get_grouped_models(key, group)

        # Track if any model in group has data
        any_has_data = False

        for grouped_model in grouped_models:
            model_data = models_data.get(grouped_model, {})

            if not model_data or (
                model_data.get("window_start_ts") is None
                and model_data.get("success_count", 0) == 0
            ):
                continue  # No stats for this model yet

            any_has_data = True

            if not self._should_model_reset(model_data, window_seconds, now_ts):
                return False  # At least one model not ready

        return any_has_data

    def _archive_model_to_global(
        self, data: Dict[str, Any], model: str, model_data: Dict[str, Any]
    ) -> None:
        """Archive a single model's stats to global."""
        global_data = data.setdefault("global", {"models": {}})
        global_model = global_data["models"].setdefault(
            model,
            {
                "success_count": 0,
                "prompt_tokens": 0,
                "completion_tokens": 0,
                "approx_cost": 0.0,
            },
        )

        global_model["success_count"] += model_data.get("success_count", 0)
        global_model["prompt_tokens"] += model_data.get("prompt_tokens", 0)
        global_model["completion_tokens"] += model_data.get("completion_tokens", 0)
        global_model["approx_cost"] += model_data.get("approx_cost", 0.0)

    def _reset_model_data(self, model_data: Dict[str, Any]) -> None:
        """Reset a model's window and stats."""
        model_data["window_start_ts"] = None
        model_data["quota_reset_ts"] = None
        model_data["success_count"] = 0
        model_data["failure_count"] = 0
        model_data["request_count"] = 0
        model_data["prompt_tokens"] = 0
        model_data["completion_tokens"] = 0
        model_data["approx_cost"] = 0.0
        # Reset quota baseline fields only if they exist (Antigravity-specific)
        # These are added by update_quota_baseline(), only called for Antigravity
        if "baseline_remaining_fraction" in model_data:
            model_data["baseline_remaining_fraction"] = None
            model_data["baseline_fetched_at"] = None
            model_data["requests_at_baseline"] = None
            # Reset quota display but keep max_requests (it doesn't change between periods)
            max_req = model_data.get("quota_max_requests")
            if max_req:
                model_data["quota_display"] = f"0/{max_req}"

    async def _check_window_reset(
        self,
        key: str,
        data: Dict[str, Any],
        reset_config: Dict[str, Any],
        now_ts: float,
    ) -> bool:
        """
        Check and perform rolling window reset for a credential.

        Args:
            key: Credential identifier
            data: Usage data for this credential
            reset_config: Provider's reset configuration
            now_ts: Current timestamp

        Returns:
            True if data was modified and needs saving
        """
        window_seconds = reset_config.get("window_seconds", 86400)  # Default 24h
        field_name = reset_config.get("field_name", "window")
        description = reset_config.get("description", "rolling window")

        # Get current window data
        window_data = data.get(field_name, {})
        window_start = window_data.get("start_ts")

        # No window started yet - nothing to reset
        if window_start is None:
            return False

        # Check if window has expired
        window_end = window_start + window_seconds
        if now_ts < window_end:
            # Window still active
            return False

        # Window expired - perform reset
        hours_elapsed = (now_ts - window_start) / 3600
        lib_logger.info(
            f"Resetting {field_name} for {mask_credential(key)} - "
            f"{description} expired after {hours_elapsed:.1f}h"
        )

        # Archive to global
        self._archive_to_global(data, window_data)

        # Preserve unexpired cooldowns
        self._preserve_unexpired_cooldowns(key, data, now_ts)

        # Reset window stats (but don't start new window until first request)
        data[field_name] = {"start_ts": None, "models": {}}

        # Reset consecutive failures
        if "failures" in data:
            data["failures"] = {}

        return True

    async def _check_daily_reset(
        self,
        key: str,
        data: Dict[str, Any],
        now_utc: datetime,
        today_str: str,
        now_ts: float,
    ) -> bool:
        """
        Check and perform legacy daily reset for a credential.

        Args:
            key: Credential identifier
            data: Usage data for this credential
            now_utc: Current datetime in UTC
            today_str: Today's date as ISO string
            now_ts: Current timestamp

        Returns:
            True if data was modified and needs saving
        """
        last_reset_str = data.get("last_daily_reset", "")

        if last_reset_str == today_str:
            return False

        last_reset_dt = None
        if last_reset_str:
            try:
                last_reset_dt = datetime.fromisoformat(last_reset_str).replace(
                    tzinfo=timezone.utc
                )
            except ValueError:
                pass

        # Determine the reset threshold for today
        reset_threshold_today = datetime.combine(
            now_utc.date(), self.daily_reset_time_utc
        )

        if not (
            last_reset_dt is None or last_reset_dt < reset_threshold_today <= now_utc
        ):
            return False

        lib_logger.debug(f"Performing daily reset for key {mask_credential(key)}")

        # Preserve unexpired cooldowns
        self._preserve_unexpired_cooldowns(key, data, now_ts)

        # Reset consecutive failures
        if "failures" in data:
            data["failures"] = {}

        # Archive daily stats to global
        daily_data = data.get("daily", {})
        if daily_data:
            self._archive_to_global(data, daily_data)

        # Reset daily stats
        data["daily"] = {"date": today_str, "models": {}}
        data["last_daily_reset"] = today_str

        return True

    def _archive_to_global(
        self, data: Dict[str, Any], source_data: Dict[str, Any]
    ) -> None:
        """
        Archive usage stats from a source field (daily/window) to global.

        Args:
            data: The credential's usage data
            source_data: The source field data to archive (has "models" key)
        """
        global_data = data.setdefault("global", {"models": {}})
        for model, stats in source_data.get("models", {}).items():
            global_model_stats = global_data["models"].setdefault(
                model,
                {
                    "success_count": 0,
                    "prompt_tokens": 0,
                    "completion_tokens": 0,
                    "approx_cost": 0.0,
                },
            )
            global_model_stats["success_count"] += stats.get("success_count", 0)
            global_model_stats["prompt_tokens"] += stats.get("prompt_tokens", 0)
            global_model_stats["completion_tokens"] += stats.get("completion_tokens", 0)
            global_model_stats["approx_cost"] += stats.get("approx_cost", 0.0)

    def _preserve_unexpired_cooldowns(
        self, key: str, data: Dict[str, Any], now_ts: float
    ) -> None:
        """
        Preserve unexpired cooldowns during reset (important for long quota cooldowns).

        Args:
            key: Credential identifier (for logging)
            data: The credential's usage data
            now_ts: Current timestamp
        """
        # Preserve unexpired model cooldowns
        if "model_cooldowns" in data:
            active_cooldowns = {
                model: end_time
                for model, end_time in data["model_cooldowns"].items()
                if end_time > now_ts
            }
            if active_cooldowns:
                max_remaining = max(
                    end_time - now_ts for end_time in active_cooldowns.values()
                )
                hours_remaining = max_remaining / 3600
                lib_logger.info(
                    f"Preserving {len(active_cooldowns)} active cooldown(s) "
                    f"for key {mask_credential(key)} during reset "
                    f"(longest: {hours_remaining:.1f}h remaining)"
                )
            data["model_cooldowns"] = active_cooldowns
        else:
            data["model_cooldowns"] = {}

        # Preserve unexpired key-level cooldown
        if data.get("key_cooldown_until"):
            if data["key_cooldown_until"] <= now_ts:
                data["key_cooldown_until"] = None
            else:
                hours_remaining = (data["key_cooldown_until"] - now_ts) / 3600
                lib_logger.info(
                    f"Preserving key-level cooldown for {mask_credential(key)} "
                    f"during reset ({hours_remaining:.1f}h remaining)"
                )
        else:
            data["key_cooldown_until"] = None

    def _initialize_key_states(self, keys: List[str]):
        """Initializes state tracking for all provided keys if not already present."""
        for key in keys:
            if key not in self.key_states:
                self.key_states[key] = {
                    "lock": asyncio.Lock(),
                    "condition": asyncio.Condition(),
                    "models_in_use": {},  # Dict[model_name, concurrent_count]
                }

    def _select_weighted_random(self, candidates: List[tuple], tolerance: float) -> str:
        """
        Selects a credential using weighted random selection based on usage counts.

        Args:
            candidates: List of (credential_id, usage_count) tuples
            tolerance: Tolerance value for weight calculation

        Returns:
            Selected credential ID

        Formula:
            weight = (max_usage - credential_usage) + tolerance + 1

        This formula ensures:
            - Lower usage = higher weight = higher selection probability
            - Tolerance adds variability: higher tolerance means more randomness
            - The +1 ensures all credentials have at least some chance of selection
        """
        if not candidates:
            raise ValueError("Cannot select from empty candidate list")

        if len(candidates) == 1:
            return candidates[0][0]

        # Extract usage counts
        usage_counts = [usage for _, usage in candidates]
        max_usage = max(usage_counts)

        # Calculate weights using the formula: (max - current) + tolerance + 1
        weights = []
        for credential, usage in candidates:
            weight = (max_usage - usage) + tolerance + 1
            weights.append(weight)

        # Log weight distribution for debugging
        if lib_logger.isEnabledFor(logging.DEBUG):
            total_weight = sum(weights)
            weight_info = ", ".join(
                f"{mask_credential(cred)}: w={w:.1f} ({w / total_weight * 100:.1f}%)"
                for (cred, _), w in zip(candidates, weights)
            )
            # lib_logger.debug(f"Weighted selection candidates: {weight_info}")

        # Random selection with weights
        selected_credential = random.choices(
            [cred for cred, _ in candidates], weights=weights, k=1
        )[0]

        return selected_credential

    async def acquire_key(
        self,
        available_keys: List[str],
        model: str,
        deadline: float,
        max_concurrent: int = 1,
        credential_priorities: Optional[Dict[str, int]] = None,
        credential_tier_names: Optional[Dict[str, str]] = None,
    ) -> str:
        """
        Acquires the best available key using a tiered, model-aware locking strategy,
        respecting a global deadline and credential priorities.

        Priority Logic:
        - Groups credentials by priority level (1=highest, 2=lower, etc.)
        - Always tries highest priority (lowest number) first
        - Within same priority, sorts by usage count (load balancing)
        - Only moves to next priority if all higher-priority keys exhausted/busy

        Args:
            available_keys: List of credential identifiers to choose from
            model: Model name being requested
            deadline: Timestamp after which to stop trying
            max_concurrent: Maximum concurrent requests allowed per credential
            credential_priorities: Optional dict mapping credentials to priority levels (1=highest)
            credential_tier_names: Optional dict mapping credentials to tier names (for logging)

        Returns:
            Selected credential identifier

        Raises:
            NoAvailableKeysError: If no key could be acquired within the deadline
        """
        await self._lazy_init()
        await self._reset_daily_stats_if_needed()
        self._initialize_key_states(available_keys)

        # This loop continues as long as the global deadline has not been met.
        while time.time() < deadline:
            now = time.time()

            # Group credentials by priority level (if priorities provided)
            if credential_priorities:
                # Group keys by priority level
                priority_groups = {}
                async with self._data_lock:
                    for key in available_keys:
                        key_data = self._usage_data.get(key, {})

                        # Skip keys on cooldown
                        if (key_data.get("key_cooldown_until") or 0) > now or (
                            key_data.get("model_cooldowns", {}).get(model) or 0
                        ) > now:
                            continue

                        # Get priority for this key (default to 999 if not specified)
                        priority = credential_priorities.get(key, 999)

                        # Get usage count for load balancing within priority groups
                        # Uses grouped usage if model is in a quota group
                        usage_count = self._get_grouped_usage_count(key, model)

                        # Group by priority
                        if priority not in priority_groups:
                            priority_groups[priority] = []
                        priority_groups[priority].append((key, usage_count))

                # Try priority groups in order (1, 2, 3, ...)
                sorted_priorities = sorted(priority_groups.keys())

                for priority_level in sorted_priorities:
                    keys_in_priority = priority_groups[priority_level]

                    # Determine selection method based on provider's rotation mode
                    provider = model.split("/")[0] if "/" in model else ""
                    rotation_mode = self._get_rotation_mode(provider)

                    # Calculate effective concurrency based on priority tier
                    multiplier = self._get_priority_multiplier(
                        provider, priority_level, rotation_mode
                    )
                    effective_max_concurrent = max_concurrent * multiplier

                    # Within each priority group, use existing tier1/tier2 logic
                    tier1_keys, tier2_keys = [], []
                    for key, usage_count in keys_in_priority:
                        key_state = self.key_states[key]

                        # Tier 1: Completely idle keys (preferred)
                        if not key_state["models_in_use"]:
                            tier1_keys.append((key, usage_count))
                        # Tier 2: Keys that can accept more concurrent requests
                        elif (
                            key_state["models_in_use"].get(model, 0)
                            < effective_max_concurrent
                        ):
                            tier2_keys.append((key, usage_count))

                    if rotation_mode == "sequential":
                        # Sequential mode: sort credentials by priority, usage, recency
                        # Keep all candidates in sorted order (no filtering to single key)
                        selection_method = "sequential"
                        if tier1_keys:
                            tier1_keys = self._sort_sequential(
                                tier1_keys, credential_priorities
                            )
                        if tier2_keys:
                            tier2_keys = self._sort_sequential(
                                tier2_keys, credential_priorities
                            )
                    elif self.rotation_tolerance > 0:
                        # Balanced mode with weighted randomness
                        selection_method = "weighted-random"
                        if tier1_keys:
                            selected_key = self._select_weighted_random(
                                tier1_keys, self.rotation_tolerance
                            )
                            tier1_keys = [
                                (k, u) for k, u in tier1_keys if k == selected_key
                            ]
                        if tier2_keys:
                            selected_key = self._select_weighted_random(
                                tier2_keys, self.rotation_tolerance
                            )
                            tier2_keys = [
                                (k, u) for k, u in tier2_keys if k == selected_key
                            ]
                    else:
                        # Deterministic: sort by usage within each tier
                        selection_method = "least-used"
                        tier1_keys.sort(key=lambda x: x[1])
                        tier2_keys.sort(key=lambda x: x[1])

                    # Try to acquire from Tier 1 first
                    for key, usage in tier1_keys:
                        state = self.key_states[key]
                        async with state["lock"]:
                            if not state["models_in_use"]:
                                state["models_in_use"][model] = 1
                                tier_name = (
                                    credential_tier_names.get(key, "unknown")
                                    if credential_tier_names
                                    else "unknown"
                                )
                                quota_display = self._get_quota_display(key, model)
                                lib_logger.info(
                                    f"Acquired key {mask_credential(key)} for model {model} "
                                    f"(tier: {tier_name}, priority: {priority_level}, selection: {selection_method}, {quota_display})"
                                )
                                return key

                    # Then try Tier 2
                    for key, usage in tier2_keys:
                        state = self.key_states[key]
                        async with state["lock"]:
                            current_count = state["models_in_use"].get(model, 0)
                            if current_count < effective_max_concurrent:
                                state["models_in_use"][model] = current_count + 1
                                tier_name = (
                                    credential_tier_names.get(key, "unknown")
                                    if credential_tier_names
                                    else "unknown"
                                )
                                quota_display = self._get_quota_display(key, model)
                                lib_logger.info(
                                    f"Acquired key {mask_credential(key)} for model {model} "
                                    f"(tier: {tier_name}, priority: {priority_level}, selection: {selection_method}, concurrent: {state['models_in_use'][model]}/{effective_max_concurrent}, {quota_display})"
                                )
                                return key

                # If we get here, all priority groups were exhausted but keys might become available
                # Collect all keys across all priorities for waiting
                all_potential_keys = []
                for keys_list in priority_groups.values():
                    all_potential_keys.extend(keys_list)

                if not all_potential_keys:
                    lib_logger.warning(
                        "No keys are eligible (all on cooldown or filtered out). Waiting before re-evaluating."
                    )
                    await asyncio.sleep(1)
                    continue

                # Wait for the highest priority key with lowest usage
                best_priority = min(priority_groups.keys())
                best_priority_keys = priority_groups[best_priority]
                best_wait_key = min(best_priority_keys, key=lambda x: x[1])[0]
                wait_condition = self.key_states[best_wait_key]["condition"]

                lib_logger.info(
                    f"All Priority-{best_priority} keys are busy. Waiting for highest priority credential to become available..."
                )

            else:
                # Original logic when no priorities specified

                # Determine selection method based on provider's rotation mode
                provider = model.split("/")[0] if "/" in model else ""
                rotation_mode = self._get_rotation_mode(provider)

                # Calculate effective concurrency for default priority (999)
                # When no priorities are specified, all credentials get default priority
                default_priority = 999
                multiplier = self._get_priority_multiplier(
                    provider, default_priority, rotation_mode
                )
                effective_max_concurrent = max_concurrent * multiplier

                tier1_keys, tier2_keys = [], []

                # First, filter the list of available keys to exclude any on cooldown.
                async with self._data_lock:
                    for key in available_keys:
                        key_data = self._usage_data.get(key, {})

                        if (key_data.get("key_cooldown_until") or 0) > now or (
                            key_data.get("model_cooldowns", {}).get(model) or 0
                        ) > now:
                            continue

                        # Prioritize keys based on their current usage to ensure load balancing.
                        # Uses grouped usage if model is in a quota group
                        usage_count = self._get_grouped_usage_count(key, model)
                        key_state = self.key_states[key]

                        # Tier 1: Completely idle keys (preferred).
                        if not key_state["models_in_use"]:
                            tier1_keys.append((key, usage_count))
                        # Tier 2: Keys that can accept more concurrent requests for this model.
                        elif (
                            key_state["models_in_use"].get(model, 0)
                            < effective_max_concurrent
                        ):
                            tier2_keys.append((key, usage_count))

                if rotation_mode == "sequential":
                    # Sequential mode: sort credentials by priority, usage, recency
                    # Keep all candidates in sorted order (no filtering to single key)
                    selection_method = "sequential"
                    if tier1_keys:
                        tier1_keys = self._sort_sequential(
                            tier1_keys, credential_priorities
                        )
                    if tier2_keys:
                        tier2_keys = self._sort_sequential(
                            tier2_keys, credential_priorities
                        )
                elif self.rotation_tolerance > 0:
                    # Balanced mode with weighted randomness
                    selection_method = "weighted-random"
                    if tier1_keys:
                        selected_key = self._select_weighted_random(
                            tier1_keys, self.rotation_tolerance
                        )
                        tier1_keys = [
                            (k, u) for k, u in tier1_keys if k == selected_key
                        ]
                    if tier2_keys:
                        selected_key = self._select_weighted_random(
                            tier2_keys, self.rotation_tolerance
                        )
                        tier2_keys = [
                            (k, u) for k, u in tier2_keys if k == selected_key
                        ]
                else:
                    # Deterministic: sort by usage within each tier
                    selection_method = "least-used"
                    tier1_keys.sort(key=lambda x: x[1])
                    tier2_keys.sort(key=lambda x: x[1])

                # Attempt to acquire a key from Tier 1 first.
                for key, usage in tier1_keys:
                    state = self.key_states[key]
                    async with state["lock"]:
                        if not state["models_in_use"]:
                            state["models_in_use"][model] = 1
                            tier_name = (
                                credential_tier_names.get(key)
                                if credential_tier_names
                                else None
                            )
                            tier_info = f"tier: {tier_name}, " if tier_name else ""
                            quota_display = self._get_quota_display(key, model)
                            lib_logger.info(
                                f"Acquired key {mask_credential(key)} for model {model} "
                                f"({tier_info}selection: {selection_method}, {quota_display})"
                            )
                            return key

                # If no Tier 1 keys are available, try Tier 2.
                for key, usage in tier2_keys:
                    state = self.key_states[key]
                    async with state["lock"]:
                        current_count = state["models_in_use"].get(model, 0)
                        if current_count < effective_max_concurrent:
                            state["models_in_use"][model] = current_count + 1
                            tier_name = (
                                credential_tier_names.get(key)
                                if credential_tier_names
                                else None
                            )
                            tier_info = f"tier: {tier_name}, " if tier_name else ""
                            quota_display = self._get_quota_display(key, model)
                            lib_logger.info(
                                f"Acquired key {mask_credential(key)} for model {model} "
                                f"({tier_info}selection: {selection_method}, concurrent: {state['models_in_use'][model]}/{effective_max_concurrent}, {quota_display})"
                            )
                            return key

                # If all eligible keys are locked, wait for a key to be released.
                lib_logger.info(
                    "All eligible keys are currently locked for this model. Waiting..."
                )

                all_potential_keys = tier1_keys + tier2_keys
                if not all_potential_keys:
                    lib_logger.warning(
                        "No keys are eligible (all on cooldown). Waiting before re-evaluating."
                    )
                    await asyncio.sleep(1)
                    continue

                # Wait on the condition of the key with the lowest current usage.
                best_wait_key = min(all_potential_keys, key=lambda x: x[1])[0]
                wait_condition = self.key_states[best_wait_key]["condition"]

            try:
                async with wait_condition:
                    remaining_budget = deadline - time.time()
                    if remaining_budget <= 0:
                        break  # Exit if the budget has already been exceeded.
                    # Wait for a notification, but no longer than the remaining budget or 1 second.
                    await asyncio.wait_for(
                        wait_condition.wait(), timeout=min(1, remaining_budget)
                    )
                lib_logger.info("Notified that a key was released. Re-evaluating...")
            except asyncio.TimeoutError:
                # This is not an error, just a timeout for the wait. The main loop will re-evaluate.
                lib_logger.info("Wait timed out. Re-evaluating for any available key.")

        # If the loop exits, it means the deadline was exceeded.
        raise NoAvailableKeysError(
            f"Could not acquire a key for model {model} within the global time budget."
        )

    async def release_key(self, key: str, model: str):
        """Releases a key's lock for a specific model and notifies waiting tasks."""
        if key not in self.key_states:
            return

        state = self.key_states[key]
        async with state["lock"]:
            if model in state["models_in_use"]:
                state["models_in_use"][model] -= 1
                remaining = state["models_in_use"][model]
                if remaining <= 0:
                    del state["models_in_use"][model]  # Clean up when count reaches 0
                lib_logger.info(
                    f"Released credential {mask_credential(key)} from model {model} "
                    f"(remaining concurrent: {max(0, remaining)})"
                )
            else:
                lib_logger.warning(
                    f"Attempted to release credential {mask_credential(key)} for model {model}, but it was not in use."
                )

        # Notify all tasks waiting on this key's condition
        async with state["condition"]:
            state["condition"].notify_all()

    async def record_success(
        self,
        key: str,
        model: str,
        completion_response: Optional[litellm.ModelResponse] = None,
    ):
        """
        Records a successful API call, resetting failure counters.
        It safely handles cases where token usage data is not available.

        Supports two modes based on provider configuration:
        - per_model: Each model has its own window_start_ts and stats in key_data["models"]
        - credential: Legacy mode with key_data["daily"]["models"]
        """
        await self._lazy_init()
        async with self._data_lock:
            now_ts = time.time()
            today_utc_str = datetime.now(timezone.utc).date().isoformat()

            reset_config = self._get_usage_reset_config(key)
            reset_mode = (
                reset_config.get("mode", "credential") if reset_config else "credential"
            )

            if reset_mode == "per_model":
                # New per-model structure
                key_data = self._usage_data.setdefault(
                    key,
                    {
                        "models": {},
                        "global": {"models": {}},
                        "model_cooldowns": {},
                        "failures": {},
                    },
                )

                # Ensure models dict exists
                if "models" not in key_data:
                    key_data["models"] = {}

                # Get or create per-model data with window tracking
                model_data = key_data["models"].setdefault(
                    model,
                    {
                        "window_start_ts": None,
                        "quota_reset_ts": None,
                        "success_count": 0,
                        "failure_count": 0,
                        "request_count": 0,
                        "prompt_tokens": 0,
                        "completion_tokens": 0,
                        "approx_cost": 0.0,
                    },
                )

                # Start window on first request for this model
                if model_data.get("window_start_ts") is None:
                    model_data["window_start_ts"] = now_ts

                    # Set expected quota reset time from provider config
                    window_seconds = (
                        reset_config.get("window_seconds", 0) if reset_config else 0
                    )
                    if window_seconds > 0:
                        model_data["quota_reset_ts"] = now_ts + window_seconds

                    window_hours = window_seconds / 3600 if window_seconds else 0
                    lib_logger.info(
                        f"Started {window_hours:.1f}h window for model {model} on {mask_credential(key)}"
                    )

                # Record stats
                model_data["success_count"] += 1
                model_data["request_count"] = model_data.get("request_count", 0) + 1

                # Sync request_count across quota group (for providers with shared quota pools)
                new_request_count = model_data["request_count"]
                group = self._get_model_quota_group(key, model)
                if group:
                    grouped_models = self._get_grouped_models(key, group)
                    for grouped_model in grouped_models:
                        if grouped_model != model:
                            other_model_data = key_data["models"].setdefault(
                                grouped_model,
                                {
                                    "window_start_ts": None,
                                    "quota_reset_ts": None,
                                    "success_count": 0,
                                    "failure_count": 0,
                                    "request_count": 0,
                                    "prompt_tokens": 0,
                                    "completion_tokens": 0,
                                    "approx_cost": 0.0,
                                },
                            )
                            other_model_data["request_count"] = new_request_count
                            # Also sync quota_max_requests if set
                            max_req = model_data.get("quota_max_requests")
                            if max_req:
                                other_model_data["quota_max_requests"] = max_req
                                other_model_data["quota_display"] = (
                                    f"{new_request_count}/{max_req}"
                                )

                # Update quota_display if max_requests is set (Antigravity-specific)
                max_req = model_data.get("quota_max_requests")
                if max_req:
                    model_data["quota_display"] = (
                        f"{model_data['request_count']}/{max_req}"
                    )

                usage_data_ref = model_data  # For token/cost recording below

            else:
                # Legacy credential-level structure
                key_data = self._usage_data.setdefault(
                    key,
                    {
                        "daily": {"date": today_utc_str, "models": {}},
                        "global": {"models": {}},
                        "model_cooldowns": {},
                        "failures": {},
                    },
                )

                if "last_daily_reset" not in key_data:
                    key_data["last_daily_reset"] = today_utc_str

                # Get or create model data in daily structure
                usage_data_ref = key_data["daily"]["models"].setdefault(
                    model,
                    {
                        "success_count": 0,
                        "prompt_tokens": 0,
                        "completion_tokens": 0,
                        "approx_cost": 0.0,
                    },
                )
                usage_data_ref["success_count"] += 1

            # Reset failures for this model
            model_failures = key_data.setdefault("failures", {}).setdefault(model, {})
            model_failures["consecutive_failures"] = 0

            # Clear transient cooldown on success (but NOT quota_reset_ts)
            if model in key_data.get("model_cooldowns", {}):
                del key_data["model_cooldowns"][model]

            # Record token and cost usage
            if (
                completion_response
                and hasattr(completion_response, "usage")
                and completion_response.usage
            ):
                usage = completion_response.usage
                usage_data_ref["prompt_tokens"] += usage.prompt_tokens
                usage_data_ref["completion_tokens"] += getattr(
                    usage, "completion_tokens", 0
                )
                lib_logger.info(
                    f"Recorded usage from response object for key {mask_credential(key)}"
                )
                try:
                    provider_name = model.split("/")[0]
                    provider_instance = self._get_provider_instance(provider_name)

                    if provider_instance and getattr(
                        provider_instance, "skip_cost_calculation", False
                    ):
                        lib_logger.debug(
                            f"Skipping cost calculation for provider '{provider_name}' (custom provider)."
                        )
                    else:
                        if isinstance(completion_response, litellm.EmbeddingResponse):
                            model_info = litellm.get_model_info(model)
                            input_cost = model_info.get("input_cost_per_token")
                            if input_cost:
                                cost = (
                                    completion_response.usage.prompt_tokens * input_cost
                                )
                            else:
                                cost = None
                        else:
                            cost = litellm.completion_cost(
                                completion_response=completion_response, model=model
                            )

                        if cost is not None:
                            usage_data_ref["approx_cost"] += cost
                except Exception as e:
                    lib_logger.warning(
                        f"Could not calculate cost for model {model}: {e}"
                    )
            elif isinstance(completion_response, asyncio.Future) or hasattr(
                completion_response, "__aiter__"
            ):
                pass  # Stream - usage recorded from chunks
            else:
                lib_logger.warning(
                    f"No usage data found in completion response for model {model}. Recording success without token count."
                )

            key_data["last_used_ts"] = now_ts

        await self._save_usage()

    async def record_failure(
        self,
        key: str,
        model: str,
        classified_error: ClassifiedError,
        increment_consecutive_failures: bool = True,
    ):
        """Records a failure and applies cooldowns based on error type.

        Distinguishes between:
        - quota_exceeded: Long cooldown with exact reset time (from quota_reset_timestamp)
          Sets quota_reset_ts on model (and group) - this becomes authoritative stats reset time
        - rate_limit: Short transient cooldown (just wait and retry)
          Only sets model_cooldowns - does NOT affect stats reset timing

        Args:
            key: The API key or credential identifier
            model: The model name
            classified_error: The classified error object
            increment_consecutive_failures: Whether to increment the failure counter.
                Set to False for provider-level errors that shouldn't count against the key.
        """
        await self._lazy_init()
        async with self._data_lock:
            now_ts = time.time()
            today_utc_str = datetime.now(timezone.utc).date().isoformat()

            reset_config = self._get_usage_reset_config(key)
            reset_mode = (
                reset_config.get("mode", "credential") if reset_config else "credential"
            )

            # Initialize key data with appropriate structure
            if reset_mode == "per_model":
                key_data = self._usage_data.setdefault(
                    key,
                    {
                        "models": {},
                        "global": {"models": {}},
                        "model_cooldowns": {},
                        "failures": {},
                    },
                )
            else:
                key_data = self._usage_data.setdefault(
                    key,
                    {
                        "daily": {"date": today_utc_str, "models": {}},
                        "global": {"models": {}},
                        "model_cooldowns": {},
                        "failures": {},
                    },
                )

            # Provider-level errors (transient issues) should not count against the key
            provider_level_errors = {"server_error", "api_connection"}

            # Determine if we should increment the failure counter
            should_increment = (
                increment_consecutive_failures
                and classified_error.error_type not in provider_level_errors
            )

            # Calculate cooldown duration based on error type
            cooldown_seconds = None
            model_cooldowns = key_data.setdefault("model_cooldowns", {})

            if classified_error.error_type == "quota_exceeded":
                # Quota exhausted - use authoritative reset timestamp if available
                quota_reset_ts = classified_error.quota_reset_timestamp
                cooldown_seconds = classified_error.retry_after or 60

                if quota_reset_ts and reset_mode == "per_model":
                    # Set quota_reset_ts on model - this becomes authoritative stats reset time
                    models_data = key_data.setdefault("models", {})
                    model_data = models_data.setdefault(
                        model,
                        {
                            "window_start_ts": None,
                            "quota_reset_ts": None,
                            "success_count": 0,
                            "failure_count": 0,
                            "request_count": 0,
                            "prompt_tokens": 0,
                            "completion_tokens": 0,
                            "approx_cost": 0.0,
                        },
                    )
                    model_data["quota_reset_ts"] = quota_reset_ts
                    # Track failure for quota estimation (request still consumes quota)
                    model_data["failure_count"] = model_data.get("failure_count", 0) + 1
                    model_data["request_count"] = model_data.get("request_count", 0) + 1
                    new_request_count = model_data["request_count"]

                    # Apply to all models in the same quota group
                    group = self._get_model_quota_group(key, model)
                    if group:
                        grouped_models = self._get_grouped_models(key, group)
                        for grouped_model in grouped_models:
                            group_model_data = models_data.setdefault(
                                grouped_model,
                                {
                                    "window_start_ts": None,
                                    "quota_reset_ts": None,
                                    "success_count": 0,
                                    "failure_count": 0,
                                    "request_count": 0,
                                    "prompt_tokens": 0,
                                    "completion_tokens": 0,
                                    "approx_cost": 0.0,
                                },
                            )
                            group_model_data["quota_reset_ts"] = quota_reset_ts
                            # Sync request_count across quota group
                            group_model_data["request_count"] = new_request_count
                            # Also sync quota_max_requests if set
                            max_req = model_data.get("quota_max_requests")
                            if max_req:
                                group_model_data["quota_max_requests"] = max_req
                                group_model_data["quota_display"] = (
                                    f"{new_request_count}/{max_req}"
                                )
                            # Also set transient cooldown for selection logic
                            model_cooldowns[grouped_model] = quota_reset_ts

                        reset_dt = datetime.fromtimestamp(
                            quota_reset_ts, tz=timezone.utc
                        )
                        lib_logger.info(
                            f"Quota exhausted for group '{group}' ({len(grouped_models)} models) "
                            f"on {mask_credential(key)}. Resets at {reset_dt.isoformat()}"
                        )
                    else:
                        reset_dt = datetime.fromtimestamp(
                            quota_reset_ts, tz=timezone.utc
                        )
                        hours = (quota_reset_ts - now_ts) / 3600
                        lib_logger.info(
                            f"Quota exhausted for model {model} on {mask_credential(key)}. "
                            f"Resets at {reset_dt.isoformat()} ({hours:.1f}h)"
                        )

                    # Set transient cooldown for selection logic
                    model_cooldowns[model] = quota_reset_ts
                else:
                    # No authoritative timestamp or legacy mode - just use retry_after
                    model_cooldowns[model] = now_ts + cooldown_seconds
                    hours = cooldown_seconds / 3600
                    lib_logger.info(
                        f"Quota exhausted on {mask_credential(key)} for model {model}. "
                        f"Cooldown: {cooldown_seconds}s ({hours:.1f}h)"
                    )

            elif classified_error.error_type == "rate_limit":
                # Transient rate limit - just set short cooldown (does NOT set quota_reset_ts)
                cooldown_seconds = classified_error.retry_after or 60
                model_cooldowns[model] = now_ts + cooldown_seconds
                lib_logger.info(
                    f"Rate limit on {mask_credential(key)} for model {model}. "
                    f"Transient cooldown: {cooldown_seconds}s"
                )

            elif classified_error.error_type == "authentication":
                # Apply a 5-minute key-level lockout for auth errors
                key_data["key_cooldown_until"] = now_ts + 300
                cooldown_seconds = 300
                model_cooldowns[model] = now_ts + cooldown_seconds
                lib_logger.warning(
                    f"Authentication error on key {mask_credential(key)}. Applying 5-minute key-level lockout."
                )

            # If we should increment failures, calculate escalating backoff
            if should_increment:
                failures_data = key_data.setdefault("failures", {})
                model_failures = failures_data.setdefault(
                    model, {"consecutive_failures": 0}
                )
                model_failures["consecutive_failures"] += 1
                count = model_failures["consecutive_failures"]

                # If cooldown wasn't set by specific error type, use escalating backoff
                if cooldown_seconds is None:
                    backoff_tiers = {1: 10, 2: 30, 3: 60, 4: 120}
                    cooldown_seconds = backoff_tiers.get(count, 7200)
                    model_cooldowns[model] = now_ts + cooldown_seconds
                    lib_logger.warning(
                        f"Failure #{count} for key {mask_credential(key)} with model {model}. "
                        f"Error type: {classified_error.error_type}, cooldown: {cooldown_seconds}s"
                    )
            else:
                # Provider-level errors: apply short cooldown but don't count against key
                if cooldown_seconds is None:
                    cooldown_seconds = 30
                    model_cooldowns[model] = now_ts + cooldown_seconds
                lib_logger.info(
                    f"Provider-level error ({classified_error.error_type}) for key {mask_credential(key)} "
                    f"with model {model}. NOT incrementing failures. Cooldown: {cooldown_seconds}s"
                )

            # Check for key-level lockout condition
            await self._check_key_lockout(key, key_data)

            # Track failure count for quota estimation (all failures consume quota)
            # This is separate from consecutive_failures which is for backoff logic
            if reset_mode == "per_model":
                models_data = key_data.setdefault("models", {})
                model_data = models_data.setdefault(
                    model,
                    {
                        "window_start_ts": None,
                        "quota_reset_ts": None,
                        "success_count": 0,
                        "failure_count": 0,
                        "request_count": 0,
                        "prompt_tokens": 0,
                        "completion_tokens": 0,
                        "approx_cost": 0.0,
                    },
                )
                # Only increment if not already incremented in quota_exceeded branch
                if classified_error.error_type != "quota_exceeded":
                    model_data["failure_count"] = model_data.get("failure_count", 0) + 1
                    model_data["request_count"] = model_data.get("request_count", 0) + 1

                    # Sync request_count across quota group
                    new_request_count = model_data["request_count"]
                    group = self._get_model_quota_group(key, model)
                    if group:
                        grouped_models = self._get_grouped_models(key, group)
                        for grouped_model in grouped_models:
                            if grouped_model != model:
                                other_model_data = models_data.setdefault(
                                    grouped_model,
                                    {
                                        "window_start_ts": None,
                                        "quota_reset_ts": None,
                                        "success_count": 0,
                                        "failure_count": 0,
                                        "request_count": 0,
                                        "prompt_tokens": 0,
                                        "completion_tokens": 0,
                                        "approx_cost": 0.0,
                                    },
                                )
                                other_model_data["request_count"] = new_request_count
                                # Also sync quota_max_requests if set
                                max_req = model_data.get("quota_max_requests")
                                if max_req:
                                    other_model_data["quota_max_requests"] = max_req
                                    other_model_data["quota_display"] = (
                                        f"{new_request_count}/{max_req}"
                                    )

            key_data["last_failure"] = {
                "timestamp": now_ts,
                "model": model,
                "error": str(classified_error.original_exception),
            }

        await self._save_usage()

    async def update_quota_baseline(
        self,
        credential: str,
        model: str,
        remaining_fraction: float,
        max_requests: Optional[int] = None,
    ) -> None:
        """
        Update quota baseline data for a credential/model after fetching from API.

        This stores the current quota state as a baseline, which is used to
        estimate remaining quota based on subsequent request counts.

        Args:
            credential: Credential identifier (file path or env:// URI)
            model: Model name (with or without provider prefix)
            remaining_fraction: Current remaining quota as fraction (0.0 to 1.0)
            max_requests: Maximum requests allowed per quota period (e.g., 250 for Claude)
        """
        await self._lazy_init()
        async with self._data_lock:
            now_ts = time.time()

            # Get or create key data structure
            key_data = self._usage_data.setdefault(
                credential,
                {
                    "models": {},
                    "global": {"models": {}},
                    "model_cooldowns": {},
                    "failures": {},
                },
            )

            # Ensure models dict exists
            if "models" not in key_data:
                key_data["models"] = {}

            # Get or create per-model data
            model_data = key_data["models"].setdefault(
                model,
                {
                    "window_start_ts": None,
                    "quota_reset_ts": None,
                    "success_count": 0,
                    "failure_count": 0,
                    "request_count": 0,
                    "prompt_tokens": 0,
                    "completion_tokens": 0,
                    "approx_cost": 0.0,
                    "baseline_remaining_fraction": None,
                    "baseline_fetched_at": None,
                    "requests_at_baseline": None,
                },
            )

            # Calculate actual used requests from API's remaining fraction
            # The API is authoritative - sync our local count to match reality
            if max_requests is not None:
                used_requests = int((1.0 - remaining_fraction) * max_requests)
            else:
                # Estimate max_requests from provider's quota cost
                # This matches how get_max_requests_for_model() calculates it
                provider = self._get_provider_from_credential(credential)
                plugin_instance = self._get_provider_instance(provider)
                if plugin_instance and hasattr(
                    plugin_instance, "get_max_requests_for_model"
                ):
                    # Get tier from provider's cache
                    tier = getattr(plugin_instance, "project_tier_cache", {}).get(
                        credential, "standard-tier"
                    )
                    # Strip provider prefix from model if present
                    clean_model = model.split("/")[-1] if "/" in model else model
                    max_requests = plugin_instance.get_max_requests_for_model(
                        clean_model, tier
                    )
                    used_requests = int((1.0 - remaining_fraction) * max_requests)
                else:
                    # Fallback: keep existing count if we can't calculate
                    used_requests = model_data.get("request_count", 0)
                    max_requests = model_data.get("quota_max_requests")

            # Sync local request count to API's authoritative value
            model_data["request_count"] = used_requests
            model_data["requests_at_baseline"] = used_requests

            # Update baseline fields
            model_data["baseline_remaining_fraction"] = remaining_fraction
            model_data["baseline_fetched_at"] = now_ts

            # Update max_requests and quota_display
            if max_requests is not None:
                model_data["quota_max_requests"] = max_requests
                model_data["quota_display"] = f"{used_requests}/{max_requests}"

            # Sync request_count and quota_max_requests across quota group
            group = self._get_model_quota_group(credential, model)
            if group:
                grouped_models = self._get_grouped_models(credential, group)
                for grouped_model in grouped_models:
                    if grouped_model != model:
                        other_model_data = key_data["models"].setdefault(
                            grouped_model,
                            {
                                "window_start_ts": None,
                                "quota_reset_ts": None,
                                "success_count": 0,
                                "failure_count": 0,
                                "request_count": 0,
                                "prompt_tokens": 0,
                                "completion_tokens": 0,
                                "approx_cost": 0.0,
                            },
                        )
                        other_model_data["request_count"] = used_requests
                        if max_requests is not None:
                            other_model_data["quota_max_requests"] = max_requests
                            other_model_data["quota_display"] = (
                                f"{used_requests}/{max_requests}"
                            )

            lib_logger.debug(
                f"Updated quota baseline for {mask_credential(credential)} model={model}: "
                f"remaining={remaining_fraction:.2%}, synced_request_count={used_requests}"
            )

        await self._save_usage()

    async def _check_key_lockout(self, key: str, key_data: Dict):
        """
        Checks if a key should be locked out due to multiple model failures.

        NOTE: This check is currently disabled. The original logic counted individual
        models in long-term lockout, but this caused issues with quota groups - when
        a single quota group (e.g., "claude" with 5 models) was exhausted, it would
        count as 5 lockouts and trigger key-level lockout, blocking other quota groups
        (like gemini) that were still available.

        The per-model and per-group cooldowns already handle quota exhaustion properly.
        """
        # Disabled - see docstring above
        pass

    async def get_stats_for_endpoint(
        self,
        provider_filter: Optional[str] = None,
        include_global: bool = True,
    ) -> Dict[str, Any]:
        """
        Get usage stats formatted for the /v1/quota-stats endpoint.

        Aggregates data from key_usage.json grouped by provider.
        Includes both current period stats and global (lifetime) stats.

        Args:
            provider_filter: If provided, only return stats for this provider
            include_global: If True, include global/lifetime stats alongside current

        Returns:
            {
                "providers": {
                    "provider_name": {
                        "credential_count": int,
                        "active_count": int,
                        "on_cooldown_count": int,
                        "total_requests": int,
                        "tokens": {
                            "input_cached": int,
                            "input_uncached": int,
                            "input_cache_pct": float,
                            "output": int
                        },
                        "approx_cost": float | None,
                        "credentials": [...],
                        "global": {...}  # If include_global is True
                    }
                },
                "summary": {...},
                "global_summary": {...},  # If include_global is True
                "timestamp": float
            }
        """
        await self._lazy_init()

        now_ts = time.time()
        providers: Dict[str, Dict[str, Any]] = {}
        # Track global stats separately
        global_providers: Dict[str, Dict[str, Any]] = {}

        async with self._data_lock:
            if not self._usage_data:
                return {
                    "providers": {},
                    "summary": {
                        "total_providers": 0,
                        "total_credentials": 0,
                        "active_credentials": 0,
                        "exhausted_credentials": 0,
                        "total_requests": 0,
                        "tokens": {
                            "input_cached": 0,
                            "input_uncached": 0,
                            "input_cache_pct": 0,
                            "output": 0,
                        },
                        "approx_total_cost": 0.0,
                    },
                    "global_summary": {
                        "total_providers": 0,
                        "total_credentials": 0,
                        "total_requests": 0,
                        "tokens": {
                            "input_cached": 0,
                            "input_uncached": 0,
                            "input_cache_pct": 0,
                            "output": 0,
                        },
                        "approx_total_cost": 0.0,
                    },
                    "data_source": "cache",
                    "timestamp": now_ts,
                }

            for credential, cred_data in self._usage_data.items():
                # Extract provider from credential path
                provider = self._get_provider_from_credential(credential)
                if not provider:
                    continue

                # Apply filter if specified
                if provider_filter and provider != provider_filter:
                    continue

                # Initialize provider entry
                if provider not in providers:
                    providers[provider] = {
                        "credential_count": 0,
                        "active_count": 0,
                        "on_cooldown_count": 0,
                        "exhausted_count": 0,
                        "total_requests": 0,
                        "tokens": {
                            "input_cached": 0,
                            "input_uncached": 0,
                            "input_cache_pct": 0,
                            "output": 0,
                        },
                        "approx_cost": 0.0,
                        "credentials": [],
                    }
                    global_providers[provider] = {
                        "total_requests": 0,
                        "tokens": {
                            "input_cached": 0,
                            "input_uncached": 0,
                            "input_cache_pct": 0,
                            "output": 0,
                        },
                        "approx_cost": 0.0,
                    }

                prov_stats = providers[provider]
                prov_stats["credential_count"] += 1

                # Determine credential status and cooldowns
                key_cooldown = cred_data.get("key_cooldown_until", 0) or 0
                model_cooldowns = cred_data.get("model_cooldowns", {})

                # Build active cooldowns with remaining time
                active_cooldowns = {}
                for model, cooldown_ts in model_cooldowns.items():
                    if cooldown_ts > now_ts:
                        remaining_seconds = int(cooldown_ts - now_ts)
                        active_cooldowns[model] = {
                            "until_ts": cooldown_ts,
                            "remaining_seconds": remaining_seconds,
                        }

                key_cooldown_remaining = None
                if key_cooldown > now_ts:
                    key_cooldown_remaining = int(key_cooldown - now_ts)

                has_active_cooldown = key_cooldown > now_ts or len(active_cooldowns) > 0

                # Check if exhausted (all quota groups exhausted for Antigravity)
                is_exhausted = False
                models_data = cred_data.get("models", {})
                if models_data:
                    # Check if any model has remaining quota
                    all_exhausted = True
                    for model_stats in models_data.values():
                        if isinstance(model_stats, dict):
                            baseline = model_stats.get("baseline_remaining_fraction")
                            if baseline is None or baseline > 0:
                                all_exhausted = False
                                break
                    if all_exhausted and len(models_data) > 0:
                        is_exhausted = True

                if is_exhausted:
                    prov_stats["exhausted_count"] += 1
                    status = "exhausted"
                elif has_active_cooldown:
                    prov_stats["on_cooldown_count"] += 1
                    status = "cooldown"
                else:
                    prov_stats["active_count"] += 1
                    status = "active"

                # Aggregate token stats (current period)
                cred_tokens = {
                    "input_cached": 0,
                    "input_uncached": 0,
                    "output": 0,
                }
                cred_requests = 0
                cred_cost = 0.0

                # Aggregate global token stats
                cred_global_tokens = {
                    "input_cached": 0,
                    "input_uncached": 0,
                    "output": 0,
                }
                cred_global_requests = 0
                cred_global_cost = 0.0

                # Handle per-model structure (current period)
                if models_data:
                    for model_name, model_stats in models_data.items():
                        if not isinstance(model_stats, dict):
                            continue
                        # Prefer request_count if available and non-zero, else fall back to success+failure
                        req_count = model_stats.get("request_count", 0)
                        if req_count > 0:
                            cred_requests += req_count
                        else:
                            cred_requests += model_stats.get("success_count", 0)
                            cred_requests += model_stats.get("failure_count", 0)
                        # Token stats - track cached separately
                        cred_tokens["input_cached"] += model_stats.get(
                            "prompt_tokens_cached", 0
                        )
                        cred_tokens["input_uncached"] += model_stats.get(
                            "prompt_tokens", 0
                        )
                        cred_tokens["output"] += model_stats.get("completion_tokens", 0)
                        cred_cost += model_stats.get("approx_cost", 0.0)

                # Handle legacy daily structure
                daily_data = cred_data.get("daily", {})
                daily_models = daily_data.get("models", {})
                for model_name, model_stats in daily_models.items():
                    if not isinstance(model_stats, dict):
                        continue
                    cred_requests += model_stats.get("success_count", 0)
                    cred_tokens["input_cached"] += model_stats.get(
                        "prompt_tokens_cached", 0
                    )
                    cred_tokens["input_uncached"] += model_stats.get("prompt_tokens", 0)
                    cred_tokens["output"] += model_stats.get("completion_tokens", 0)
                    cred_cost += model_stats.get("approx_cost", 0.0)

                # Handle global stats
                global_data = cred_data.get("global", {})
                global_models = global_data.get("models", {})
                for model_name, model_stats in global_models.items():
                    if not isinstance(model_stats, dict):
                        continue
                    cred_global_requests += model_stats.get("success_count", 0)
                    cred_global_tokens["input_cached"] += model_stats.get(
                        "prompt_tokens_cached", 0
                    )
                    cred_global_tokens["input_uncached"] += model_stats.get(
                        "prompt_tokens", 0
                    )
                    cred_global_tokens["output"] += model_stats.get(
                        "completion_tokens", 0
                    )
                    cred_global_cost += model_stats.get("approx_cost", 0.0)

                # Add current period stats to global totals
                cred_global_requests += cred_requests
                cred_global_tokens["input_cached"] += cred_tokens["input_cached"]
                cred_global_tokens["input_uncached"] += cred_tokens["input_uncached"]
                cred_global_tokens["output"] += cred_tokens["output"]
                cred_global_cost += cred_cost

                # Build credential entry
                # Mask credential identifier for display
                if credential.startswith("env://"):
                    identifier = credential
                else:
                    identifier = Path(credential).name

                cred_entry = {
                    "identifier": identifier,
                    "full_path": credential,
                    "status": status,
                    "last_used_ts": cred_data.get("last_used_ts"),
                    "requests": cred_requests,
                    "tokens": cred_tokens,
                    "approx_cost": cred_cost if cred_cost > 0 else None,
                }

                # Add cooldown info
                if key_cooldown_remaining is not None:
                    cred_entry["key_cooldown_remaining"] = key_cooldown_remaining
                if active_cooldowns:
                    cred_entry["model_cooldowns"] = active_cooldowns

                # Add global stats for this credential
                if include_global:
                    # Calculate global cache percentage
                    global_total_input = (
                        cred_global_tokens["input_cached"]
                        + cred_global_tokens["input_uncached"]
                    )
                    global_cache_pct = (
                        round(
                            cred_global_tokens["input_cached"]
                            / global_total_input
                            * 100,
                            1,
                        )
                        if global_total_input > 0
                        else 0
                    )

                    cred_entry["global"] = {
                        "requests": cred_global_requests,
                        "tokens": {
                            "input_cached": cred_global_tokens["input_cached"],
                            "input_uncached": cred_global_tokens["input_uncached"],
                            "input_cache_pct": global_cache_pct,
                            "output": cred_global_tokens["output"],
                        },
                        "approx_cost": cred_global_cost
                        if cred_global_cost > 0
                        else None,
                    }

                # Add model-specific data for providers with per-model tracking
                if models_data:
                    cred_entry["models"] = {}
                    for model_name, model_stats in models_data.items():
                        if not isinstance(model_stats, dict):
                            continue
                        cred_entry["models"][model_name] = {
                            "requests": model_stats.get("success_count", 0)
                            + model_stats.get("failure_count", 0),
                            "request_count": model_stats.get("request_count", 0),
                            "success_count": model_stats.get("success_count", 0),
                            "failure_count": model_stats.get("failure_count", 0),
                            "prompt_tokens": model_stats.get("prompt_tokens", 0),
                            "prompt_tokens_cached": model_stats.get(
                                "prompt_tokens_cached", 0
                            ),
                            "completion_tokens": model_stats.get(
                                "completion_tokens", 0
                            ),
                            "approx_cost": model_stats.get("approx_cost", 0.0),
                            "window_start_ts": model_stats.get("window_start_ts"),
                            "quota_reset_ts": model_stats.get("quota_reset_ts"),
                            # Quota baseline fields (Antigravity-specific)
                            "baseline_remaining_fraction": model_stats.get(
                                "baseline_remaining_fraction"
                            ),
                            "baseline_fetched_at": model_stats.get(
                                "baseline_fetched_at"
                            ),
                            "quota_max_requests": model_stats.get("quota_max_requests"),
                            "quota_display": model_stats.get("quota_display"),
                        }

                prov_stats["credentials"].append(cred_entry)

                # Aggregate to provider totals (current period)
                prov_stats["total_requests"] += cred_requests
                prov_stats["tokens"]["input_cached"] += cred_tokens["input_cached"]
                prov_stats["tokens"]["input_uncached"] += cred_tokens["input_uncached"]
                prov_stats["tokens"]["output"] += cred_tokens["output"]
                if cred_cost > 0:
                    prov_stats["approx_cost"] += cred_cost

                # Aggregate to global provider totals
                global_providers[provider]["total_requests"] += cred_global_requests
                global_providers[provider]["tokens"]["input_cached"] += (
                    cred_global_tokens["input_cached"]
                )
                global_providers[provider]["tokens"]["input_uncached"] += (
                    cred_global_tokens["input_uncached"]
                )
                global_providers[provider]["tokens"]["output"] += cred_global_tokens[
                    "output"
                ]
                global_providers[provider]["approx_cost"] += cred_global_cost

        # Calculate cache percentages for each provider
        for provider, prov_stats in providers.items():
            total_input = (
                prov_stats["tokens"]["input_cached"]
                + prov_stats["tokens"]["input_uncached"]
            )
            if total_input > 0:
                prov_stats["tokens"]["input_cache_pct"] = round(
                    prov_stats["tokens"]["input_cached"] / total_input * 100, 1
                )
            # Set cost to None if 0
            if prov_stats["approx_cost"] == 0:
                prov_stats["approx_cost"] = None

            # Calculate global cache percentages
            if include_global and provider in global_providers:
                gp = global_providers[provider]
                global_total = (
                    gp["tokens"]["input_cached"] + gp["tokens"]["input_uncached"]
                )
                if global_total > 0:
                    gp["tokens"]["input_cache_pct"] = round(
                        gp["tokens"]["input_cached"] / global_total * 100, 1
                    )
                if gp["approx_cost"] == 0:
                    gp["approx_cost"] = None
                prov_stats["global"] = gp

        # Build summary (current period)
        total_creds = sum(p["credential_count"] for p in providers.values())
        active_creds = sum(p["active_count"] for p in providers.values())
        exhausted_creds = sum(p["exhausted_count"] for p in providers.values())
        total_requests = sum(p["total_requests"] for p in providers.values())
        total_input_cached = sum(
            p["tokens"]["input_cached"] for p in providers.values()
        )
        total_input_uncached = sum(
            p["tokens"]["input_uncached"] for p in providers.values()
        )
        total_output = sum(p["tokens"]["output"] for p in providers.values())
        total_cost = sum(p["approx_cost"] or 0 for p in providers.values())

        total_input = total_input_cached + total_input_uncached
        input_cache_pct = (
            round(total_input_cached / total_input * 100, 1) if total_input > 0 else 0
        )

        result = {
            "providers": providers,
            "summary": {
                "total_providers": len(providers),
                "total_credentials": total_creds,
                "active_credentials": active_creds,
                "exhausted_credentials": exhausted_creds,
                "total_requests": total_requests,
                "tokens": {
                    "input_cached": total_input_cached,
                    "input_uncached": total_input_uncached,
                    "input_cache_pct": input_cache_pct,
                    "output": total_output,
                },
                "approx_total_cost": total_cost if total_cost > 0 else None,
            },
            "data_source": "cache",
            "timestamp": now_ts,
        }

        # Build global summary
        if include_global:
            global_total_requests = sum(
                gp["total_requests"] for gp in global_providers.values()
            )
            global_total_input_cached = sum(
                gp["tokens"]["input_cached"] for gp in global_providers.values()
            )
            global_total_input_uncached = sum(
                gp["tokens"]["input_uncached"] for gp in global_providers.values()
            )
            global_total_output = sum(
                gp["tokens"]["output"] for gp in global_providers.values()
            )
            global_total_cost = sum(
                gp["approx_cost"] or 0 for gp in global_providers.values()
            )

            global_total_input = global_total_input_cached + global_total_input_uncached
            global_input_cache_pct = (
                round(global_total_input_cached / global_total_input * 100, 1)
                if global_total_input > 0
                else 0
            )

            result["global_summary"] = {
                "total_providers": len(global_providers),
                "total_credentials": total_creds,
                "total_requests": global_total_requests,
                "tokens": {
                    "input_cached": global_total_input_cached,
                    "input_uncached": global_total_input_uncached,
                    "input_cache_pct": global_input_cache_pct,
                    "output": global_total_output,
                },
                "approx_total_cost": global_total_cost
                if global_total_cost > 0
                else None,
            }

        return result

    async def reload_from_disk(self) -> None:
        """
        Force reload usage data from disk.

        Useful when another process may have updated the file.
        """
        async with self._init_lock:
            self._initialized.clear()
            await self._load_usage()
            await self._reset_daily_stats_if_needed()
            self._initialized.set()