Spaces:
Running
Running
Pulastya B
committed on
Commit
·
3e672a1
1
Parent(s):
7c2ff18
Fix TPM rate limiting: Add automatic retry with delay parsing
Browse files
- Catch 429 TPM errors and automatically retry after delay
- Parse retry delay from Groq error message (e.g., 'retry in 45s')
- Default to 60s wait for TPM limits
- Distinguish TPM (retry) from TPD (fail immediately)
- Explain rolling window: previous requests in 60s window still count
Root cause: Container starts with tokens_this_minute=0, but Groq uses account-wide rolling window. Previous requests from any session count toward limit.
Solution: Graceful retry instead of failing on first request.
- src/orchestrator.py +47 -12
src/orchestrator.py
CHANGED
|
@@ -1690,20 +1690,55 @@ You are a DOER. Complete workflows based on user intent."""
|
|
| 1690 |
# Check if it's a rate limit error (429)
|
| 1691 |
error_str = str(groq_error)
|
| 1692 |
if "rate_limit" in error_str.lower() or "429" in error_str:
|
| 1693 |
-
#
|
| 1694 |
-
|
| 1695 |
-
|
| 1696 |
-
|
| 1697 |
-
|
|
|
|
|
|
|
|
|
|
| 1698 |
elif "tokens per minute" in error_str or "TPM" in error_str:
|
| 1699 |
-
|
| 1700 |
-
|
| 1701 |
-
|
| 1702 |
-
|
| 1703 |
-
print(f"
|
| 1704 |
-
print(f" Error: {error_str[:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1705 |
|
| 1706 |
-
|
|
|
|
|
|
|
| 1707 |
else:
|
| 1708 |
# Not a rate limit error, re-raise
|
| 1709 |
raise
|
|
|
|
| 1690 |
# Check if it's a rate limit error (429)
|
| 1691 |
error_str = str(groq_error)
|
| 1692 |
if "rate_limit" in error_str.lower() or "429" in error_str:
|
| 1693 |
+
# Parse retry delay from error message if available
|
| 1694 |
+
retry_delay = 60 # Default to 60s for TPM limit
|
| 1695 |
+
|
| 1696 |
+
# Try to extract retry delay from error
|
| 1697 |
+
import re
|
| 1698 |
+
delay_match = re.search(r'retry.*?(\d+).*?second', error_str, re.IGNORECASE)
|
| 1699 |
+
if delay_match:
|
| 1700 |
+
retry_delay = int(delay_match.group(1))
|
| 1701 |
elif "tokens per minute" in error_str or "TPM" in error_str:
|
| 1702 |
+
retry_delay = 60
|
| 1703 |
+
elif "tokens per day" in error_str or "TPD" in error_str:
|
| 1704 |
+
# Daily limit - give up immediately
|
| 1705 |
+
print(f"β GROQ DAILY TOKEN LIMIT EXHAUSTED (100K tokens/day)")
|
| 1706 |
+
print(f" Your daily quota resets at UTC midnight")
|
| 1707 |
+
print(f" Error: {error_str[:400]}")
|
| 1708 |
+
raise ValueError(f"Groq daily quota exhausted. Please wait for reset.\n{error_str[:500]}")
|
| 1709 |
+
|
| 1710 |
+
# TPM limit - wait and retry
|
| 1711 |
+
print(f"β οΈ GROQ TPM RATE LIMIT (rolling 60s window)")
|
| 1712 |
+
print(f" Groq uses account-wide rolling window - previous requests still count")
|
| 1713 |
+
print(f" Waiting {retry_delay}s and retrying...")
|
| 1714 |
+
print(f" Error: {error_str[:300]}")
|
| 1715 |
+
|
| 1716 |
+
time.sleep(retry_delay)
|
| 1717 |
+
|
| 1718 |
+
# Retry the request
|
| 1719 |
+
print(f"π Retrying after {retry_delay}s delay...")
|
| 1720 |
+
response = self.groq_client.chat.completions.create(
|
| 1721 |
+
model=self.model,
|
| 1722 |
+
messages=messages,
|
| 1723 |
+
tools=tools_to_use,
|
| 1724 |
+
tool_choice="auto",
|
| 1725 |
+
parallel_tool_calls=False,
|
| 1726 |
+
temperature=0.1,
|
| 1727 |
+
max_tokens=4096
|
| 1728 |
+
)
|
| 1729 |
+
|
| 1730 |
+
self.api_calls_made += 1
|
| 1731 |
+
self.last_api_call_time = time.time()
|
| 1732 |
+
|
| 1733 |
+
# Track tokens used
|
| 1734 |
+
if hasattr(response, 'usage') and response.usage:
|
| 1735 |
+
tokens_used = response.usage.total_tokens
|
| 1736 |
+
self.tokens_this_minute += tokens_used
|
| 1737 |
+
print(f"π Tokens: {tokens_used} this call | {self.tokens_this_minute}/{self.tpm_limit} this minute")
|
| 1738 |
|
| 1739 |
+
response_message = response.choices[0].message
|
| 1740 |
+
tool_calls = response_message.tool_calls
|
| 1741 |
+
final_content = response_message.content
|
| 1742 |
else:
|
| 1743 |
# Not a rate limit error, re-raise
|
| 1744 |
raise
|