fix: use synchronous client to resolve StopIteration error
- Switched from AsyncInferenceClient to InferenceClient
- Removed async/await from generate method
- Use stream=False to get complete response
- More reliable and eliminates StopIteration error
Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
- backend/src/ai/qwen_client.py +16 -30
- backend/src/api/chat.py +2 -2
backend/src/ai/qwen_client.py
CHANGED
|
@@ -3,12 +3,11 @@
|
|
| 3 |
# Qwen Client - Hugging Face SDK wrapper with retry logic
|
| 4 |
|
| 5 |
import os
|
| 6 |
-
import asyncio
|
| 7 |
import logging
|
| 8 |
from typing import List, Dict, Any, Optional
|
| 9 |
import random
|
| 10 |
|
| 11 |
-
from huggingface_hub import
|
| 12 |
|
| 13 |
|
| 14 |
logger = logging.getLogger(__name__)
|
|
@@ -46,12 +45,12 @@ class QwenClient:
|
|
| 46 |
"Please set it in your .env file."
|
| 47 |
)
|
| 48 |
|
| 49 |
-
# Initialize
|
| 50 |
-
self.client =
|
| 51 |
|
| 52 |
logger.info(f"Qwen client initialized with model: {self.model}, timeout: {timeout}s")
|
| 53 |
|
| 54 |
-
|
| 55 |
self,
|
| 56 |
messages: List[Dict[str, str]],
|
| 57 |
temperature: float = 0.7,
|
|
@@ -71,6 +70,8 @@ class QwenClient:
|
|
| 71 |
Raises:
|
| 72 |
Exception: If all retries exhausted
|
| 73 |
"""
|
|
|
|
|
|
|
| 74 |
for attempt in range(self.max_retries):
|
| 75 |
try:
|
| 76 |
logger.info(f"Qwen inference attempt {attempt + 1}/{self.max_retries}")
|
|
@@ -78,33 +79,18 @@ class QwenClient:
|
|
| 78 |
# Build prompt from messages
|
| 79 |
prompt = self._build_prompt(messages)
|
| 80 |
|
| 81 |
-
# Call Hugging Face API
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
do_sample=True
|
| 90 |
-
):
|
| 91 |
-
response_parts.append(chunk)
|
| 92 |
-
return "".join(response_parts)
|
| 93 |
-
|
| 94 |
-
response = await asyncio.wait_for(collect_response(), timeout=self.timeout)
|
| 95 |
|
| 96 |
logger.info("Qwen inference successful")
|
| 97 |
return response.strip()
|
| 98 |
|
| 99 |
-
except asyncio.TimeoutError:
|
| 100 |
-
logger.warning(f"Qwen inference timeout on attempt {attempt + 1}")
|
| 101 |
-
if attempt == self.max_retries - 1:
|
| 102 |
-
raise TimeoutError(f"Qwen inference timed out after {self.max_retries} attempts")
|
| 103 |
-
# Exponential backoff with jitter
|
| 104 |
-
wait_time = (2 ** attempt) + random.uniform(0, 1)
|
| 105 |
-
logger.info(f"Retrying in {wait_time:.2f}s...")
|
| 106 |
-
await asyncio.sleep(wait_time)
|
| 107 |
-
|
| 108 |
except Exception as e:
|
| 109 |
logger.error(f"Qwen inference failed on attempt {attempt + 1}: {str(e)}")
|
| 110 |
if attempt == self.max_retries - 1:
|
|
@@ -113,11 +99,11 @@ class QwenClient:
|
|
| 113 |
# Check if it's a rate limit error (HTTP 429)
|
| 114 |
if "429" in str(e) or "rate limit" in str(e).lower():
|
| 115 |
logger.warning("Rate limit detected, waiting 60 seconds...")
|
| 116 |
-
|
| 117 |
else:
|
| 118 |
wait_time = (2 ** attempt) + random.uniform(0, 1)
|
| 119 |
logger.info(f"Retrying in {wait_time:.2f}s...")
|
| 120 |
-
|
| 121 |
|
| 122 |
def _build_prompt(self, messages: List[Dict[str, str]]) -> str:
|
| 123 |
"""
|
|
|
|
| 3 |
# Qwen Client - Hugging Face SDK wrapper with retry logic
|
| 4 |
|
| 5 |
import os
|
|
|
|
| 6 |
import logging
|
| 7 |
from typing import List, Dict, Any, Optional
|
| 8 |
import random
|
| 9 |
|
| 10 |
+
from huggingface_hub import InferenceClient
|
| 11 |
|
| 12 |
|
| 13 |
logger = logging.getLogger(__name__)
|
|
|
|
| 45 |
"Please set it in your .env file."
|
| 46 |
)
|
| 47 |
|
| 48 |
+
# Initialize synchronous inference client (more reliable than async)
|
| 49 |
+
self.client = InferenceClient(model=self.model, token=self.api_key)
|
| 50 |
|
| 51 |
logger.info(f"Qwen client initialized with model: {self.model}, timeout: {timeout}s")
|
| 52 |
|
| 53 |
+
def generate(
|
| 54 |
self,
|
| 55 |
messages: List[Dict[str, str]],
|
| 56 |
temperature: float = 0.7,
|
|
|
|
| 70 |
Raises:
|
| 71 |
Exception: If all retries exhausted
|
| 72 |
"""
|
| 73 |
+
import time
|
| 74 |
+
|
| 75 |
for attempt in range(self.max_retries):
|
| 76 |
try:
|
| 77 |
logger.info(f"Qwen inference attempt {attempt + 1}/{self.max_retries}")
|
|
|
|
| 79 |
# Build prompt from messages
|
| 80 |
prompt = self._build_prompt(messages)
|
| 81 |
|
| 82 |
+
# Call Hugging Face API with synchronous client
|
| 83 |
+
response = self.client.text_generation(
|
| 84 |
+
prompt=prompt,
|
| 85 |
+
temperature=temperature,
|
| 86 |
+
max_new_tokens=max_tokens,
|
| 87 |
+
do_sample=True,
|
| 88 |
+
stream=False # Get complete response, not a generator
|
| 89 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
|
| 91 |
logger.info("Qwen inference successful")
|
| 92 |
return response.strip()
|
| 93 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
except Exception as e:
|
| 95 |
logger.error(f"Qwen inference failed on attempt {attempt + 1}: {str(e)}")
|
| 96 |
if attempt == self.max_retries - 1:
|
|
|
|
| 99 |
# Check if it's a rate limit error (HTTP 429)
|
| 100 |
if "429" in str(e) or "rate limit" in str(e).lower():
|
| 101 |
logger.warning("Rate limit detected, waiting 60 seconds...")
|
| 102 |
+
time.sleep(60)
|
| 103 |
else:
|
| 104 |
wait_time = (2 ** attempt) + random.uniform(0, 1)
|
| 105 |
logger.info(f"Retrying in {wait_time:.2f}s...")
|
| 106 |
+
time.sleep(wait_time)
|
| 107 |
|
| 108 |
def _build_prompt(self, messages: List[Dict[str, str]]) -> str:
|
| 109 |
"""
|
backend/src/api/chat.py
CHANGED
|
@@ -189,7 +189,7 @@ async def chat(
|
|
| 189 |
]
|
| 190 |
|
| 191 |
# Get AI response
|
| 192 |
-
ai_response =
|
| 193 |
|
| 194 |
# Check if AI wants to call a tool
|
| 195 |
tool_call = extract_tool_call(ai_response)
|
|
@@ -212,7 +212,7 @@ async def chat(
|
|
| 212 |
{"role": "user", "content": f"Tool executed successfully. Here is the result:\n{tool_result_text}\n\nPlease format this for the user in {language}."}
|
| 213 |
]
|
| 214 |
|
| 215 |
-
final_response =
|
| 216 |
|
| 217 |
# Save assistant response
|
| 218 |
conv_repo.add_message(
|
|
|
|
| 189 |
]
|
| 190 |
|
| 191 |
# Get AI response
|
| 192 |
+
ai_response = qwen_client.generate(qwen_messages)
|
| 193 |
|
| 194 |
# Check if AI wants to call a tool
|
| 195 |
tool_call = extract_tool_call(ai_response)
|
|
|
|
| 212 |
{"role": "user", "content": f"Tool executed successfully. Here is the result:\n{tool_result_text}\n\nPlease format this for the user in {language}."}
|
| 213 |
]
|
| 214 |
|
| 215 |
+
final_response = qwen_client.generate(followup_messages)
|
| 216 |
|
| 217 |
# Save assistant response
|
| 218 |
conv_repo.add_message(
|