Fix Gemini RPM (4 not 14) and add daily quota warnings at 50/70/90/99%
Free tier is 5 RPM (not 15) — using max_rpm=14 caused every call to 429.
Now uses max_rpm=4 to stay safely under 5 RPM.
Adds _track_daily_request() called on every successful response:
- Warns at 50%, 70%, 90%, 99% of GEMINI_DAILY_LIMIT (default 1500 RPD)
- Resets counter at midnight Pacific Time
- Daily limit overridable via GEMINI_DAILY_LIMIT env var
Free tier facts: 5 RPM, ~1500 RPD, 250k TPM; resets midnight Pacific.
Cost on paid tier: ~$0.000133/request (~$0.20/day at 1500 RPD).
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
- src/soci/engine/llm.py +39 -3
src/soci/engine/llm.py
CHANGED
|
@@ -625,10 +625,18 @@ class GroqClient:
|
|
| 625 |
class GeminiClient:
|
| 626 |
"""Google Gemini via the OpenAI-compatible AI Studio endpoint.
|
| 627 |
|
| 628 |
-
Free tier (no credit card):
|
| 629 |
-
- gemini-2.0-flash:
|
|
|
|
|
|
|
| 630 |
- Get a free key at https://aistudio.google.com/apikey
|
| 631 |
Uses the OpenAI-compatible endpoint so no extra SDK is needed.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 632 |
"""
|
| 633 |
|
| 634 |
def __init__(
|
|
@@ -636,7 +644,8 @@ class GeminiClient:
|
|
| 636 |
api_key: Optional[str] = None,
|
| 637 |
default_model: str = MODEL_GEMINI_FLASH,
|
| 638 |
max_retries: int = 3,
|
| 639 |
-
max_rpm: int =
|
|
|
|
| 640 |
) -> None:
|
| 641 |
self.api_key = api_key or os.environ.get("GEMINI_API_KEY", "")
|
| 642 |
if not self.api_key:
|
|
@@ -660,10 +669,35 @@ class GeminiClient:
|
|
| 660 |
self._last_request_time: float = 0.0
|
| 661 |
self._rate_lock = asyncio.Lock()
|
| 662 |
self._rate_limited_until: float = 0.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 663 |
|
| 664 |
def _is_quota_exhausted(self) -> bool:
|
| 665 |
return time.monotonic() < self._rate_limited_until
|
| 666 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 667 |
async def _wait_for_rate_limit(self) -> None:
|
| 668 |
async with self._rate_lock:
|
| 669 |
now = time.monotonic()
|
|
@@ -717,6 +751,7 @@ class GeminiClient:
|
|
| 717 |
data = resp.json()
|
| 718 |
usage = data.get("usage", {})
|
| 719 |
self.usage.record(model, usage.get("prompt_tokens", 0), usage.get("completion_tokens", 0))
|
|
|
|
| 720 |
return data["choices"][0]["message"]["content"]
|
| 721 |
except httpx.HTTPStatusError as e:
|
| 722 |
if e.response.status_code == 429:
|
|
@@ -787,6 +822,7 @@ class GeminiClient:
|
|
| 787 |
data = resp.json()
|
| 788 |
usage = data.get("usage", {})
|
| 789 |
self.usage.record(model, usage.get("prompt_tokens", 0), usage.get("completion_tokens", 0))
|
|
|
|
| 790 |
text = data["choices"][0]["message"]["content"]
|
| 791 |
return _parse_json_response(text)
|
| 792 |
except httpx.HTTPStatusError as e:
|
|
|
|
| 625 |
class GeminiClient:
|
| 626 |
"""Google Gemini via the OpenAI-compatible AI Studio endpoint.
|
| 627 |
|
| 628 |
+
Free tier (no credit card, as of 2026):
|
| 629 |
+
- gemini-2.0-flash: 5 RPM, ~1,500 RPD, 250,000 TPM
|
| 630 |
+
- Daily quota resets at midnight Pacific Time
|
| 631 |
+
- Paid tier: $0.10/1M input tokens, $0.40/1M output tokens
|
| 632 |
- Get a free key at https://aistudio.google.com/apikey
|
| 633 |
Uses the OpenAI-compatible endpoint so no extra SDK is needed.
|
| 634 |
+
|
| 635 |
+
Token/cost guide (typical Soci request ~1,000 input + 90 output tokens):
|
| 636 |
+
- Cost per request (paid): ~$0.000133 ($0.10 input + $0.40 output per 1M)
|
| 637 |
+
- 1,500 RPD free tier ≈ $0.20/day on paid tier
|
| 638 |
+
- 500 RPD usage ≈ $0.07/day
|
| 639 |
+
- Override daily limit via GEMINI_DAILY_LIMIT env var.
|
| 640 |
"""
|
| 641 |
|
| 642 |
def __init__(
|
|
|
|
| 644 |
api_key: Optional[str] = None,
|
| 645 |
default_model: str = MODEL_GEMINI_FLASH,
|
| 646 |
max_retries: int = 3,
|
| 647 |
+
max_rpm: int = 4, # stay under 5 RPM free-tier limit (was 14, caused constant 429s)
|
| 648 |
+
daily_limit: int = 1500, # free-tier RPD; override with GEMINI_DAILY_LIMIT
|
| 649 |
) -> None:
|
| 650 |
self.api_key = api_key or os.environ.get("GEMINI_API_KEY", "")
|
| 651 |
if not self.api_key:
|
|
|
|
| 669 |
self._last_request_time: float = 0.0
|
| 670 |
self._rate_lock = asyncio.Lock()
|
| 671 |
self._rate_limited_until: float = 0.0
|
| 672 |
+
# Daily usage tracking — resets at midnight Pacific (UTC-8/-7)
|
| 673 |
+
self._daily_limit: int = int(os.environ.get("GEMINI_DAILY_LIMIT", str(daily_limit)))
|
| 674 |
+
self._daily_requests: int = 0
|
| 675 |
+
self._daily_date: str = "" # "YYYY-MM-DD" in Pacific time
|
| 676 |
+
self._warned_thresholds: set = set() # tracks which % levels were already logged
|
| 677 |
|
| 678 |
def _is_quota_exhausted(self) -> bool:
|
| 679 |
return time.monotonic() < self._rate_limited_until
|
| 680 |
|
| 681 |
+
def _track_daily_request(self) -> None:
|
| 682 |
+
"""Increment daily counter and log warnings at 50/70/90/99% of the daily limit."""
|
| 683 |
+
import datetime as _dt
|
| 684 |
+
# Pacific time offset: UTC-8 (PST) / UTC-7 (PDT). Use -8 as a safe conservative value.
|
| 685 |
+
pacific_offset = _dt.timezone(_dt.timedelta(hours=-8))
|
| 686 |
+
today = _dt.datetime.now(pacific_offset).strftime("%Y-%m-%d")
|
| 687 |
+
if today != self._daily_date:
|
| 688 |
+
self._daily_date = today
|
| 689 |
+
self._daily_requests = 0
|
| 690 |
+
self._warned_thresholds = set()
|
| 691 |
+
self._daily_requests += 1
|
| 692 |
+
pct = self._daily_requests / self._daily_limit
|
| 693 |
+
for threshold in (0.50, 0.70, 0.90, 0.99):
|
| 694 |
+
if pct >= threshold and threshold not in self._warned_thresholds:
|
| 695 |
+
self._warned_thresholds.add(threshold)
|
| 696 |
+
logger.warning(
|
| 697 |
+
f"Gemini daily quota: {self._daily_requests}/{self._daily_limit} requests used "
|
| 698 |
+
f"({pct * 100:.0f}%) — resets at midnight Pacific Time"
|
| 699 |
+
)
|
| 700 |
+
|
| 701 |
async def _wait_for_rate_limit(self) -> None:
|
| 702 |
async with self._rate_lock:
|
| 703 |
now = time.monotonic()
|
|
|
|
| 751 |
data = resp.json()
|
| 752 |
usage = data.get("usage", {})
|
| 753 |
self.usage.record(model, usage.get("prompt_tokens", 0), usage.get("completion_tokens", 0))
|
| 754 |
+
self._track_daily_request()
|
| 755 |
return data["choices"][0]["message"]["content"]
|
| 756 |
except httpx.HTTPStatusError as e:
|
| 757 |
if e.response.status_code == 429:
|
|
|
|
| 822 |
data = resp.json()
|
| 823 |
usage = data.get("usage", {})
|
| 824 |
self.usage.record(model, usage.get("prompt_tokens", 0), usage.get("completion_tokens", 0))
|
| 825 |
+
self._track_daily_request()
|
| 826 |
text = data["choices"][0]["message"]["content"]
|
| 827 |
return _parse_json_response(text)
|
| 828 |
except httpx.HTTPStatusError as e:
|