Spaces:

elmerzole
/

llm-api-proxy

Paused

Mirrowel committed on Dec 16, 2025

Commit

1ac7bd0

1 Parent(s): 65fe549

refactor(usage): 🔨 sync request_count across quota group models

Refactor quota group handling to synchronize request_count across all models
in a quota group instead of recalculating it on each access. This improves
performance and ensures consistency.

The previous implementation iterated through all models in a group to sum
request counts and find baselines on each quota check. The new implementation
proactively syncs these values when requests are recorded.

Changes:
- Sync request_count to all models in quota group when recording successes,
failures, or updating quota baselines
- Quota estimation now reads values from a single representative model (the
first model in the group), since all models in the group are kept in sync
- Add credential filename parsing support for OAuth files without paths

Also in this commit:
- chore(gitignore): ignore quota_viewer_config.json

Files changed (3)

.gitignore +2 -0
src/rotator_library/providers/utilities/antigravity_quota_tracker.py +29 -37
src/rotator_library/usage_manager.py +114 -9

.gitignore CHANGED Viewed

@@ -124,9 +124,11 @@ start_proxy.bat
 key_usage.json
 staged_changes.txt
 launcher_config.json
 cache/antigravity/thought_signatures.json
 logs/
 cache/
 *.env
 oauth_creds/

 key_usage.json
 staged_changes.txt
 launcher_config.json
+quota_viewer_config.json
 cache/antigravity/thought_signatures.json
 logs/
 cache/
 *.env
 oauth_creds/

src/rotator_library/providers/utilities/antigravity_quota_tracker.py CHANGED Viewed

@@ -651,41 +651,31 @@ class AntigravityQuotaTracker:
                         cred_usage = usage_data[cred_path]
                         models_usage = cred_usage.get("models", {})
-                        # Sum up request counts across all models in group
-                        total_requests = 0
-                        baseline_remaining = None
-                        baseline_fetched_at = None
-                        reset_time_iso = None
-                        for gm in group_models:
-                            # Try with and without provider prefix
-                            prefixed_model = f"antigravity/{gm}"
-                            model_usage = models_usage.get(
-                                prefixed_model
-                            ) or models_usage.get(gm, {})
-                            total_requests += model_usage.get("request_count", 0)
-                            # Use the first available baseline
-                            if baseline_remaining is None:
-                                baseline_remaining = model_usage.get(
-                                    "baseline_remaining_fraction"
-                                )
-                                baseline_fetched_at = model_usage.get(
-                                    "baseline_fetched_at"
-                                )
-                            # Use earliest reset time
-                            if model_usage.get("quota_reset_ts"):
-                                ts = model_usage["quota_reset_ts"]
-                                try:
-                                    iso = datetime.fromtimestamp(
-                                        ts, tz=timezone.utc
-                                    ).isoformat()
-                                    if reset_time_iso is None or iso < reset_time_iso:
-                                        reset_time_iso = iso
-                                except (ValueError, OSError):
-                                    pass
                         # Calculate estimate
                         # cost_per_request is in percentage (0.4 = 0.4%), convert to fraction
@@ -693,9 +683,11 @@ class AntigravityQuotaTracker:
                             group_models[0], tier
                         )
                         cost_per_request_fraction = cost_per_request_percent / 100.0
-                        max_requests = self.get_max_requests_for_model(
-                            group_models[0], tier
-                        )
                         if baseline_remaining is not None:
                             estimated_remaining = baseline_remaining - (

                         cred_usage = usage_data[cred_path]
                         models_usage = cred_usage.get("models", {})
+                        # Get request_count from representative model (synced across group)
+                        # Try with and without provider prefix for first model in group
+                        representative_model = group_models[0]
+                        prefixed_model = f"antigravity/{representative_model}"
+                        model_usage = models_usage.get(
+                            prefixed_model
+                        ) or models_usage.get(representative_model, {})
+                        total_requests = model_usage.get("request_count", 0)
+                        baseline_remaining = model_usage.get(
+                            "baseline_remaining_fraction"
+                        )
+                        baseline_fetched_at = model_usage.get("baseline_fetched_at")
+                        max_requests = model_usage.get("quota_max_requests")
+                        # Get reset time from any model in group (also synced)
+                        reset_time_iso = None
+                        if model_usage.get("quota_reset_ts"):
+                            ts = model_usage["quota_reset_ts"]
+                            try:
+                                reset_time_iso = datetime.fromtimestamp(
+                                    ts, tz=timezone.utc
+                                ).isoformat()
+                            except (ValueError, OSError):
+                                pass
                         # Calculate estimate
                         # cost_per_request is in percentage (0.4 = 0.4%), convert to fraction
                             group_models[0], tier
                         )
                         cost_per_request_fraction = cost_per_request_percent / 100.0
+                        # Use max_requests from usage data if available, otherwise calculate
+                        if max_requests is None:
+                            max_requests = self.get_max_requests_for_model(
+                                group_models[0], tier
+                            )
                         if baseline_remaining is not None:
                             estimated_remaining = baseline_remaining - (

src/rotator_library/usage_manager.py CHANGED Viewed

@@ -186,6 +186,7 @@ class UsageManager:
         Supports multiple credential formats:
         - OAuth: "oauth_creds/antigravity_oauth_15.json" -> "antigravity"
         - OAuth: "C:\\...\\oauth_creds\\gemini_cli_oauth_1.json" -> "gemini_cli"
         - API key style: stored with provider prefix metadata
         Args:
@@ -199,7 +200,7 @@ class UsageManager:
         # Normalize path separators
         normalized = credential.replace("\\", "/")
-        # Pattern: {provider}_oauth_{number}.json
         match = re.search(r"/([a-z_]+)_oauth_\d+\.json$", normalized, re.IGNORECASE)
         if match:
             return match.group(1).lower()
@@ -209,6 +210,11 @@ class UsageManager:
         if match:
             return match.group(1).lower()
         return None
     def _get_provider_instance(self, provider: str) -> Optional[Any]:
@@ -337,22 +343,20 @@ class UsageManager:
         """
         Get usage count for credential selection, considering quota groups.
-        If the model belongs to a quota group, returns the weighted combined usage
-        across all models in the group. Otherwise returns individual model usage.
-        Weights are applied per-model to account for models that consume more quota
-        per request (e.g., Opus might count 2x compared to Sonnet).
         For providers in _REQUEST_COUNT_PROVIDERS (e.g., antigravity), uses
         request_count instead of success_count since failed requests also
         consume quota.
         Args:
             key: Credential identifier
             model: Model name (with provider prefix, e.g., "antigravity/claude-sonnet-4-5")
         Returns:
-            Weighted combined usage if grouped, otherwise individual model usage
         """
         # Determine usage field based on provider
         # Some providers (antigravity) count failed requests against quota
@@ -363,7 +367,14 @@ class UsageManager:
             else "success_count"
         )
-        # Check if model is in a quota group
         group = self._get_model_quota_group(key, model)
         if group:
@@ -1571,6 +1582,35 @@ class UsageManager:
                 model_data["success_count"] += 1
                 model_data["request_count"] = model_data.get("request_count", 0) + 1
                 # Update quota_display if max_requests is set (Antigravity-specific)
                 max_req = model_data.get("quota_max_requests")
                 if max_req:
@@ -1765,6 +1805,7 @@ class UsageManager:
                     # Track failure for quota estimation (request still consumes quota)
                     model_data["failure_count"] = model_data.get("failure_count", 0) + 1
                     model_data["request_count"] = model_data.get("request_count", 0) + 1
                     # Apply to all models in the same quota group
                     group = self._get_model_quota_group(key, model)
@@ -1785,6 +1826,15 @@ class UsageManager:
                                 },
                             )
                             group_model_data["quota_reset_ts"] = quota_reset_ts
                             # Also set transient cooldown for selection logic
                             model_cooldowns[grouped_model] = quota_reset_ts
@@ -1887,6 +1937,35 @@ class UsageManager:
                     model_data["failure_count"] = model_data.get("failure_count", 0) + 1
                     model_data["request_count"] = model_data.get("request_count", 0) + 1
             key_data["last_failure"] = {
                 "timestamp": now_ts,
                 "model": model,
@@ -1991,6 +2070,32 @@ class UsageManager:
                 model_data["quota_max_requests"] = max_requests
                 model_data["quota_display"] = f"{used_requests}/{max_requests}"
             lib_logger.debug(
                 f"Updated quota baseline for {mask_credential(credential)} model={model}: "
                 f"remaining={remaining_fraction:.2%}, synced_request_count={used_requests}"

         Supports multiple credential formats:
         - OAuth: "oauth_creds/antigravity_oauth_15.json" -> "antigravity"
         - OAuth: "C:\\...\\oauth_creds\\gemini_cli_oauth_1.json" -> "gemini_cli"
+        - OAuth filename only: "antigravity_oauth_1.json" -> "antigravity"
         - API key style: stored with provider prefix metadata
         Args:
         # Normalize path separators
         normalized = credential.replace("\\", "/")
+        # Pattern: path ending with {provider}_oauth_{number}.json
         match = re.search(r"/([a-z_]+)_oauth_\d+\.json$", normalized, re.IGNORECASE)
         if match:
             return match.group(1).lower()
         if match:
             return match.group(1).lower()
+        # Pattern: filename only {provider}_oauth_{number}.json (no path)
+        match = re.match(r"([a-z_]+)_oauth_\d+\.json$", normalized, re.IGNORECASE)
+        if match:
+            return match.group(1).lower()
         return None
     def _get_provider_instance(self, provider: str) -> Optional[Any]:
         """
         Get usage count for credential selection, considering quota groups.
         For providers in _REQUEST_COUNT_PROVIDERS (e.g., antigravity), uses
         request_count instead of success_count since failed requests also
         consume quota.
+        If the model belongs to a quota group, the request_count is already
+        synced across all models in the group (by record_success/record_failure),
+        so we just read from the requested model directly.
         Args:
             key: Credential identifier
             model: Model name (with provider prefix, e.g., "antigravity/claude-sonnet-4-5")
         Returns:
+            Usage count for the model (synced across group if applicable)
         """
         # Determine usage field based on provider
         # Some providers (antigravity) count failed requests against quota
             else "success_count"
         )
+        # For providers with synced quota groups (antigravity), request_count
+        # is already synced across all models in the group, so just read directly.
+        # For other providers, we still need to sum success_count across group.
+        if provider in self._REQUEST_COUNT_PROVIDERS:
+            # request_count is synced - just read the model's value
+            return self._get_usage_count(key, model, usage_field)
+        # For non-synced providers, check if model is in a quota group and sum
         group = self._get_model_quota_group(key, model)
         if group:
                 model_data["success_count"] += 1
                 model_data["request_count"] = model_data.get("request_count", 0) + 1
+                # Sync request_count across quota group (for providers with shared quota pools)
+                new_request_count = model_data["request_count"]
+                group = self._get_model_quota_group(key, model)
+                if group:
+                    grouped_models = self._get_grouped_models(key, group)
+                    for grouped_model in grouped_models:
+                        if grouped_model != model:
+                            other_model_data = key_data["models"].setdefault(
+                                grouped_model,
+                                {
+                                    "window_start_ts": None,
+                                    "quota_reset_ts": None,
+                                    "success_count": 0,
+                                    "failure_count": 0,
+                                    "request_count": 0,
+                                    "prompt_tokens": 0,
+                                    "completion_tokens": 0,
+                                    "approx_cost": 0.0,
+                                },
+                            )
+                            other_model_data["request_count"] = new_request_count
+                            # Also sync quota_max_requests if set
+                            max_req = model_data.get("quota_max_requests")
+                            if max_req:
+                                other_model_data["quota_max_requests"] = max_req
+                                other_model_data["quota_display"] = (
+                                    f"{new_request_count}/{max_req}"
+                                )
                 # Update quota_display if max_requests is set (Antigravity-specific)
                 max_req = model_data.get("quota_max_requests")
                 if max_req:
                     # Track failure for quota estimation (request still consumes quota)
                     model_data["failure_count"] = model_data.get("failure_count", 0) + 1
                     model_data["request_count"] = model_data.get("request_count", 0) + 1
+                    new_request_count = model_data["request_count"]
                     # Apply to all models in the same quota group
                     group = self._get_model_quota_group(key, model)
                                 },
                             )
                             group_model_data["quota_reset_ts"] = quota_reset_ts
+                            # Sync request_count across quota group
+                            group_model_data["request_count"] = new_request_count
+                            # Also sync quota_max_requests if set
+                            max_req = model_data.get("quota_max_requests")
+                            if max_req:
+                                group_model_data["quota_max_requests"] = max_req
+                                group_model_data["quota_display"] = (
+                                    f"{new_request_count}/{max_req}"
+                                )
                             # Also set transient cooldown for selection logic
                             model_cooldowns[grouped_model] = quota_reset_ts
                     model_data["failure_count"] = model_data.get("failure_count", 0) + 1
                     model_data["request_count"] = model_data.get("request_count", 0) + 1
+                    # Sync request_count across quota group
+                    new_request_count = model_data["request_count"]
+                    group = self._get_model_quota_group(key, model)
+                    if group:
+                        grouped_models = self._get_grouped_models(key, group)
+                        for grouped_model in grouped_models:
+                            if grouped_model != model:
+                                other_model_data = models_data.setdefault(
+                                    grouped_model,
+                                    {
+                                        "window_start_ts": None,
+                                        "quota_reset_ts": None,
+                                        "success_count": 0,
+                                        "failure_count": 0,
+                                        "request_count": 0,
+                                        "prompt_tokens": 0,
+                                        "completion_tokens": 0,
+                                        "approx_cost": 0.0,
+                                    },
+                                )
+                                other_model_data["request_count"] = new_request_count
+                                # Also sync quota_max_requests if set
+                                max_req = model_data.get("quota_max_requests")
+                                if max_req:
+                                    other_model_data["quota_max_requests"] = max_req
+                                    other_model_data["quota_display"] = (
+                                        f"{new_request_count}/{max_req}"
+                                    )
             key_data["last_failure"] = {
                 "timestamp": now_ts,
                 "model": model,
                 model_data["quota_max_requests"] = max_requests
                 model_data["quota_display"] = f"{used_requests}/{max_requests}"
+            # Sync request_count and quota_max_requests across quota group
+            group = self._get_model_quota_group(credential, model)
+            if group:
+                grouped_models = self._get_grouped_models(credential, group)
+                for grouped_model in grouped_models:
+                    if grouped_model != model:
+                        other_model_data = key_data["models"].setdefault(
+                            grouped_model,
+                            {
+                                "window_start_ts": None,
+                                "quota_reset_ts": None,
+                                "success_count": 0,
+                                "failure_count": 0,
+                                "request_count": 0,
+                                "prompt_tokens": 0,
+                                "completion_tokens": 0,
+                                "approx_cost": 0.0,
+                            },
+                        )
+                        other_model_data["request_count"] = used_requests
+                        if max_requests is not None:
+                            other_model_data["quota_max_requests"] = max_requests
+                            other_model_data["quota_display"] = (
+                                f"{used_requests}/{max_requests}"
+                            )
             lib_logger.debug(
                 f"Updated quota baseline for {mask_credential(credential)} model={model}: "
                 f"remaining={remaining_fraction:.2%}, synced_request_count={used_requests}"