Spaces:
Sleeping
Sleeping
Fix: Add rate limit handling with retry and delays
1. LLM client: Add exponential backoff retry on 429 errors
- MAX_RETRIES=3, delays of 2s, 4s, 8s
- New _request_with_retry() helper method
2. Critic: Add 2s delay before LLM call
- Avoids hitting rate limit after Analyzer's call
3. Analyzer: Add 2s delay before revision LLM call
- Avoids hitting rate limit after Critic's call
This prevents 429 errors from cascading through the workflow.
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
- src/llm_client.py +37 -7
- src/nodes/analyzer.py +6 -0
- src/nodes/critic.py +5 -0
src/llm_client.py
CHANGED
|
@@ -6,8 +6,14 @@ Adopts pattern from Enterprise-AI-Gateway for resilient LLM access.
|
|
| 6 |
import os
|
| 7 |
import time
|
| 8 |
import requests
|
|
|
|
| 9 |
from typing import Optional, Tuple
|
| 10 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
class LLMClient:
|
| 12 |
"""LLM client with automatic provider fallback."""
|
| 13 |
|
|
@@ -93,8 +99,35 @@ class LLMClient:
|
|
| 93 |
|
| 94 |
return None, None, f"All LLM providers failed: {'; '.join(errors)}", providers_failed
|
| 95 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
def _call_provider(self, provider: dict, prompt: str, temperature: float, max_tokens: int) -> Tuple[Optional[str], Optional[str]]:
|
| 97 |
-
"""Call a specific LLM provider."""
|
| 98 |
headers = {"Content-Type": "application/json"}
|
| 99 |
|
| 100 |
if provider["name"] == "groq":
|
|
@@ -105,8 +138,7 @@ class LLMClient:
|
|
| 105 |
"max_tokens": max_tokens,
|
| 106 |
"temperature": temperature,
|
| 107 |
}
|
| 108 |
-
response =
|
| 109 |
-
response.raise_for_status()
|
| 110 |
data = response.json()
|
| 111 |
if data and "choices" in data and data["choices"]:
|
| 112 |
return data["choices"][0]["message"]["content"], None
|
|
@@ -121,8 +153,7 @@ class LLMClient:
|
|
| 121 |
"maxOutputTokens": max_tokens,
|
| 122 |
}
|
| 123 |
}
|
| 124 |
-
response =
|
| 125 |
-
response.raise_for_status()
|
| 126 |
data = response.json()
|
| 127 |
if data and "candidates" in data and data["candidates"]:
|
| 128 |
first_candidate = data["candidates"][0]
|
|
@@ -142,8 +173,7 @@ class LLMClient:
|
|
| 142 |
"max_tokens": max_tokens,
|
| 143 |
"temperature": temperature,
|
| 144 |
}
|
| 145 |
-
response =
|
| 146 |
-
response.raise_for_status()
|
| 147 |
data = response.json()
|
| 148 |
if data and "choices" in data and data["choices"]:
|
| 149 |
return data["choices"][0]["message"]["content"], None
|
|
|
|
| 6 |
import os
|
| 7 |
import time
|
| 8 |
import requests
|
| 9 |
+
from requests.exceptions import HTTPError
|
| 10 |
from typing import Optional, Tuple
|
| 11 |
|
| 12 |
+
# Retry configuration for rate limits
|
| 13 |
+
MAX_RETRIES = 3
|
| 14 |
+
INITIAL_BACKOFF = 2 # seconds
|
| 15 |
+
|
| 16 |
+
|
| 17 |
class LLMClient:
|
| 18 |
"""LLM client with automatic provider fallback."""
|
| 19 |
|
|
|
|
| 99 |
|
| 100 |
return None, None, f"All LLM providers failed: {'; '.join(errors)}", providers_failed
|
| 101 |
|
def _request_with_retry(self, url: str, headers: dict, payload: dict, provider_name: str) -> requests.Response:
    """POST *payload* to *url*, retrying with exponential backoff on HTTP 429.

    Makes up to MAX_RETRIES attempts, sleeping INITIAL_BACKOFF * 2**attempt
    seconds between rate-limited attempts (2s, 4s, 8s). Any non-429 HTTP
    error, or a 429 on the final attempt, is re-raised to the caller so the
    provider-fallback logic above can record it.

    Args:
        url: Provider endpoint to POST to.
        headers: HTTP headers for the request.
        payload: JSON-serializable request body.
        provider_name: Human-readable provider name, used in log output only.

    Returns:
        The successful requests.Response.

    Raises:
        HTTPError: On any non-429 status, or on a 429 after the last retry.
    """
    for attempt in range(MAX_RETRIES):
        try:
            response = requests.post(url, headers=headers, json=payload, timeout=30)
            response.raise_for_status()
            return response
        except HTTPError as e:
            is_rate_limit = e.response is not None and e.response.status_code == 429
            # Re-raise immediately for non-429 errors, or a 429 on the final
            # attempt. (The original also had a dead `except Exception: raise`
            # clause and `last_error` bookkeeping; both removed — re-raising
            # here makes the tail below genuinely unreachable.)
            if not is_rate_limit or attempt == MAX_RETRIES - 1:
                raise
            backoff = INITIAL_BACKOFF * (2 ** attempt)  # 2s, 4s, 8s
            # NOTE(review): could honor the server's Retry-After header here
            # instead of a fixed schedule — confirm whether providers send it.
            print(f"Rate limited by {provider_name}, retrying in {backoff}s (attempt {attempt + 1}/{MAX_RETRIES})...")
            time.sleep(backoff)
    # Defensive: unreachable, since the loop either returns or raises above.
    raise RuntimeError(f"Request failed after {MAX_RETRIES} attempts")
| 128 |
+
|
| 129 |
def _call_provider(self, provider: dict, prompt: str, temperature: float, max_tokens: int) -> Tuple[Optional[str], Optional[str]]:
|
| 130 |
+
"""Call a specific LLM provider with retry on rate limit."""
|
| 131 |
headers = {"Content-Type": "application/json"}
|
| 132 |
|
| 133 |
if provider["name"] == "groq":
|
|
|
|
| 138 |
"max_tokens": max_tokens,
|
| 139 |
"temperature": temperature,
|
| 140 |
}
|
| 141 |
+
response = self._request_with_retry(provider["url"], headers, payload, provider["name"])
|
|
|
|
| 142 |
data = response.json()
|
| 143 |
if data and "choices" in data and data["choices"]:
|
| 144 |
return data["choices"][0]["message"]["content"], None
|
|
|
|
| 153 |
"maxOutputTokens": max_tokens,
|
| 154 |
}
|
| 155 |
}
|
| 156 |
+
response = self._request_with_retry(url, headers, payload, provider["name"])
|
|
|
|
| 157 |
data = response.json()
|
| 158 |
if data and "candidates" in data and data["candidates"]:
|
| 159 |
first_candidate = data["candidates"][0]
|
|
|
|
| 173 |
"max_tokens": max_tokens,
|
| 174 |
"temperature": temperature,
|
| 175 |
}
|
| 176 |
+
response = self._request_with_retry(provider["url"], headers, payload, provider["name"])
|
|
|
|
| 177 |
data = response.json()
|
| 178 |
if data and "choices" in data and data["choices"]:
|
| 179 |
return data["choices"][0]["message"]["content"], None
|
src/nodes/analyzer.py
CHANGED
|
@@ -1019,6 +1019,12 @@ def analyzer_node(state, workflow_id=None, progress_store=None):
|
|
| 1019 |
prompt = _build_analyzer_prompt(company, ticker, formatted_data, is_financial)
|
| 1020 |
current_revision = 0
|
| 1021 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1022 |
start_time = time.time()
|
| 1023 |
response, provider, error, providers_failed = llm.query(prompt, temperature=0)
|
| 1024 |
elapsed = time.time() - start_time
|
|
|
|
| 1019 |
prompt = _build_analyzer_prompt(company, ticker, formatted_data, is_financial)
|
| 1020 |
current_revision = 0
|
| 1021 |
|
| 1022 |
+
# In revision mode, add delay before LLM call to avoid rate limits
|
| 1023 |
+
# (Critic just called LLM, so we need to wait)
|
| 1024 |
+
if is_revision:
|
| 1025 |
+
print("Waiting 2s before revision LLM call (rate limit buffer)...")
|
| 1026 |
+
time.sleep(2)
|
| 1027 |
+
|
| 1028 |
start_time = time.time()
|
| 1029 |
response, provider, error, providers_failed = llm.query(prompt, temperature=0)
|
| 1030 |
elapsed = time.time() - start_time
|
src/nodes/critic.py
CHANGED
|
@@ -320,6 +320,11 @@ def critic_node(state, workflow_id=None, progress_store=None):
|
|
| 320 |
# Run LLM evaluation
|
| 321 |
print(f"Running LLM evaluation (iteration {iteration})...")
|
| 322 |
llm = get_llm_client()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 323 |
_add_activity_log(workflow_id, progress_store, "critic", "Calling LLM for quality evaluation...")
|
| 324 |
start_time = time.time()
|
| 325 |
|
|
|
|
| 320 |
# Run LLM evaluation
|
| 321 |
print(f"Running LLM evaluation (iteration {iteration})...")
|
| 322 |
llm = get_llm_client()
|
| 323 |
+
|
| 324 |
+
# Add delay before LLM call to avoid rate limits (Analyzer just called LLM)
|
| 325 |
+
print("Waiting 2s before Critic LLM call (rate limit buffer)...")
|
| 326 |
+
time.sleep(2)
|
| 327 |
+
|
| 328 |
_add_activity_log(workflow_id, progress_store, "critic", "Calling LLM for quality evaluation...")
|
| 329 |
start_time = time.time()
|
| 330 |
|