vn6295337 Claude Opus 4.5 committed on
Commit
de66f3b
·
1 Parent(s): 379f970

Fix: Add rate limit handling with retry and delays

Browse files

1. LLM client: Add exponential backoff retry on 429 errors
- MAX_RETRIES=3, delays of 2s, 4s, 8s
- New _request_with_retry() helper method

2. Critic: Add 2s delay before LLM call
- Avoids hitting rate limit after Analyzer's call

3. Analyzer: Add 2s delay before revision LLM call
- Avoids hitting rate limit after Critic's call

This prevents 429 errors from cascading through the workflow.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

src/llm_client.py CHANGED
@@ -6,8 +6,14 @@ Adopts pattern from Enterprise-AI-Gateway for resilient LLM access.
6
  import os
7
  import time
8
  import requests
 
9
  from typing import Optional, Tuple
10
 
 
 
 
 
 
11
  class LLMClient:
12
  """LLM client with automatic provider fallback."""
13
 
@@ -93,8 +99,35 @@ class LLMClient:
93
 
94
  return None, None, f"All LLM providers failed: {'; '.join(errors)}", providers_failed
95
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
  def _call_provider(self, provider: dict, prompt: str, temperature: float, max_tokens: int) -> Tuple[Optional[str], Optional[str]]:
97
- """Call a specific LLM provider."""
98
  headers = {"Content-Type": "application/json"}
99
 
100
  if provider["name"] == "groq":
@@ -105,8 +138,7 @@ class LLMClient:
105
  "max_tokens": max_tokens,
106
  "temperature": temperature,
107
  }
108
- response = requests.post(provider["url"], headers=headers, json=payload, timeout=30)
109
- response.raise_for_status()
110
  data = response.json()
111
  if data and "choices" in data and data["choices"]:
112
  return data["choices"][0]["message"]["content"], None
@@ -121,8 +153,7 @@ class LLMClient:
121
  "maxOutputTokens": max_tokens,
122
  }
123
  }
124
- response = requests.post(url, headers=headers, json=payload, timeout=30)
125
- response.raise_for_status()
126
  data = response.json()
127
  if data and "candidates" in data and data["candidates"]:
128
  first_candidate = data["candidates"][0]
@@ -142,8 +173,7 @@ class LLMClient:
142
  "max_tokens": max_tokens,
143
  "temperature": temperature,
144
  }
145
- response = requests.post(provider["url"], headers=headers, json=payload, timeout=30)
146
- response.raise_for_status()
147
  data = response.json()
148
  if data and "choices" in data and data["choices"]:
149
  return data["choices"][0]["message"]["content"], None
 
6
  import os
7
  import time
8
  import requests
9
+ from requests.exceptions import HTTPError
10
  from typing import Optional, Tuple
11
 
12
+ # Retry configuration for rate limits
13
+ MAX_RETRIES = 3
14
+ INITIAL_BACKOFF = 2 # seconds
15
+
16
+
17
  class LLMClient:
18
  """LLM client with automatic provider fallback."""
19
 
 
99
 
100
  return None, None, f"All LLM providers failed: {'; '.join(errors)}", providers_failed
101
 
102
+ def _request_with_retry(self, url: str, headers: dict, payload: dict, provider_name: str) -> requests.Response:
103
+ """Make HTTP request with exponential backoff retry on 429 rate limit."""
104
+ last_error = None
105
+
106
+ for attempt in range(MAX_RETRIES):
107
+ try:
108
+ response = requests.post(url, headers=headers, json=payload, timeout=30)
109
+ response.raise_for_status()
110
+ return response
111
+ except HTTPError as e:
112
+ if e.response is not None and e.response.status_code == 429:
113
+ last_error = e
114
+ if attempt < MAX_RETRIES - 1:
115
+ backoff = INITIAL_BACKOFF * (2 ** attempt) # 2s, 4s, 8s
116
+ print(f"Rate limited by {provider_name}, retrying in {backoff}s (attempt {attempt + 1}/{MAX_RETRIES})...")
117
+ time.sleep(backoff)
118
+ continue
119
+ # Re-raise non-429 errors or final 429
120
+ raise
121
+ except Exception:
122
+ raise
123
+
124
+ # Should not reach here, but just in case
125
+ if last_error:
126
+ raise last_error
127
+ raise Exception(f"Request failed after {MAX_RETRIES} attempts")
128
+
129
  def _call_provider(self, provider: dict, prompt: str, temperature: float, max_tokens: int) -> Tuple[Optional[str], Optional[str]]:
130
+ """Call a specific LLM provider with retry on rate limit."""
131
  headers = {"Content-Type": "application/json"}
132
 
133
  if provider["name"] == "groq":
 
138
  "max_tokens": max_tokens,
139
  "temperature": temperature,
140
  }
141
+ response = self._request_with_retry(provider["url"], headers, payload, provider["name"])
 
142
  data = response.json()
143
  if data and "choices" in data and data["choices"]:
144
  return data["choices"][0]["message"]["content"], None
 
153
  "maxOutputTokens": max_tokens,
154
  }
155
  }
156
+ response = self._request_with_retry(url, headers, payload, provider["name"])
 
157
  data = response.json()
158
  if data and "candidates" in data and data["candidates"]:
159
  first_candidate = data["candidates"][0]
 
173
  "max_tokens": max_tokens,
174
  "temperature": temperature,
175
  }
176
+ response = self._request_with_retry(provider["url"], headers, payload, provider["name"])
 
177
  data = response.json()
178
  if data and "choices" in data and data["choices"]:
179
  return data["choices"][0]["message"]["content"], None
src/nodes/analyzer.py CHANGED
@@ -1019,6 +1019,12 @@ def analyzer_node(state, workflow_id=None, progress_store=None):
1019
  prompt = _build_analyzer_prompt(company, ticker, formatted_data, is_financial)
1020
  current_revision = 0
1021
 
 
 
 
 
 
 
1022
  start_time = time.time()
1023
  response, provider, error, providers_failed = llm.query(prompt, temperature=0)
1024
  elapsed = time.time() - start_time
 
1019
  prompt = _build_analyzer_prompt(company, ticker, formatted_data, is_financial)
1020
  current_revision = 0
1021
 
1022
+ # In revision mode, add delay before LLM call to avoid rate limits
1023
+ # (Critic just called LLM, so we need to wait)
1024
+ if is_revision:
1025
+ print("Waiting 2s before revision LLM call (rate limit buffer)...")
1026
+ time.sleep(2)
1027
+
1028
  start_time = time.time()
1029
  response, provider, error, providers_failed = llm.query(prompt, temperature=0)
1030
  elapsed = time.time() - start_time
src/nodes/critic.py CHANGED
@@ -320,6 +320,11 @@ def critic_node(state, workflow_id=None, progress_store=None):
320
  # Run LLM evaluation
321
  print(f"Running LLM evaluation (iteration {iteration})...")
322
  llm = get_llm_client()
 
 
 
 
 
323
  _add_activity_log(workflow_id, progress_store, "critic", "Calling LLM for quality evaluation...")
324
  start_time = time.time()
325
 
 
320
  # Run LLM evaluation
321
  print(f"Running LLM evaluation (iteration {iteration})...")
322
  llm = get_llm_client()
323
+
324
+ # Add delay before LLM call to avoid rate limits (Analyzer just called LLM)
325
+ print("Waiting 2s before Critic LLM call (rate limit buffer)...")
326
+ time.sleep(2)
327
+
328
  _add_activity_log(workflow_id, progress_store, "critic", "Calling LLM for quality evaluation...")
329
  start_time = time.time()
330