Nanny7 Claude Sonnet 4.5 committed on
Commit
fe33dd1
·
1 Parent(s): 67b26d6

fix: use synchronous client to resolve StopIteration error

Browse files

- Switched from AsyncInferenceClient to InferenceClient
- Removed async/await from generate method
- Use stream=False to get complete response
- More reliable and eliminates StopIteration error

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>

backend/src/ai/qwen_client.py CHANGED
@@ -3,12 +3,11 @@
3
  # Qwen Client - Hugging Face SDK wrapper with retry logic
4
 
5
  import os
6
- import asyncio
7
  import logging
8
  from typing import List, Dict, Any, Optional
9
  import random
10
 
11
- from huggingface_hub import AsyncInferenceClient
12
 
13
 
14
  logger = logging.getLogger(__name__)
@@ -46,12 +45,12 @@ class QwenClient:
46
  "Please set it in your .env file."
47
  )
48
 
49
- # Initialize async inference client
50
- self.client = AsyncInferenceClient(model=self.model, token=self.api_key)
51
 
52
  logger.info(f"Qwen client initialized with model: {self.model}, timeout: {timeout}s")
53
 
54
- async def generate(
55
  self,
56
  messages: List[Dict[str, str]],
57
  temperature: float = 0.7,
@@ -71,6 +70,8 @@ class QwenClient:
71
  Raises:
72
  Exception: If all retries exhausted
73
  """
 
 
74
  for attempt in range(self.max_retries):
75
  try:
76
  logger.info(f"Qwen inference attempt {attempt + 1}/{self.max_retries}")
@@ -78,33 +79,18 @@ class QwenClient:
78
  # Build prompt from messages
79
  prompt = self._build_prompt(messages)
80
 
81
- # Call Hugging Face API - text_generation returns an async generator
82
- # We need to collect the full response
83
- response_parts = []
84
- async def collect_response():
85
- async for chunk in self.client.text_generation(
86
- prompt=prompt,
87
- temperature=temperature,
88
- max_new_tokens=max_tokens,
89
- do_sample=True
90
- ):
91
- response_parts.append(chunk)
92
- return "".join(response_parts)
93
-
94
- response = await asyncio.wait_for(collect_response(), timeout=self.timeout)
95
 
96
  logger.info("Qwen inference successful")
97
  return response.strip()
98
 
99
- except asyncio.TimeoutError:
100
- logger.warning(f"Qwen inference timeout on attempt {attempt + 1}")
101
- if attempt == self.max_retries - 1:
102
- raise TimeoutError(f"Qwen inference timed out after {self.max_retries} attempts")
103
- # Exponential backoff with jitter
104
- wait_time = (2 ** attempt) + random.uniform(0, 1)
105
- logger.info(f"Retrying in {wait_time:.2f}s...")
106
- await asyncio.sleep(wait_time)
107
-
108
  except Exception as e:
109
  logger.error(f"Qwen inference failed on attempt {attempt + 1}: {str(e)}")
110
  if attempt == self.max_retries - 1:
@@ -113,11 +99,11 @@ class QwenClient:
113
  # Check if it's a rate limit error (HTTP 429)
114
  if "429" in str(e) or "rate limit" in str(e).lower():
115
  logger.warning("Rate limit detected, waiting 60 seconds...")
116
- await asyncio.sleep(60)
117
  else:
118
  wait_time = (2 ** attempt) + random.uniform(0, 1)
119
  logger.info(f"Retrying in {wait_time:.2f}s...")
120
- await asyncio.sleep(wait_time)
121
 
122
  def _build_prompt(self, messages: List[Dict[str, str]]) -> str:
123
  """
 
3
  # Qwen Client - Hugging Face SDK wrapper with retry logic
4
 
5
  import os
 
6
  import logging
7
  from typing import List, Dict, Any, Optional
8
  import random
9
 
10
+ from huggingface_hub import InferenceClient
11
 
12
 
13
  logger = logging.getLogger(__name__)
 
45
  "Please set it in your .env file."
46
  )
47
 
48
+ # Initialize synchronous inference client (more reliable than async)
49
+ self.client = InferenceClient(model=self.model, token=self.api_key)
50
 
51
  logger.info(f"Qwen client initialized with model: {self.model}, timeout: {timeout}s")
52
 
53
+ def generate(
54
  self,
55
  messages: List[Dict[str, str]],
56
  temperature: float = 0.7,
 
70
  Raises:
71
  Exception: If all retries exhausted
72
  """
73
+ import time
74
+
75
  for attempt in range(self.max_retries):
76
  try:
77
  logger.info(f"Qwen inference attempt {attempt + 1}/{self.max_retries}")
 
79
  # Build prompt from messages
80
  prompt = self._build_prompt(messages)
81
 
82
+ # Call Hugging Face API with synchronous client
83
+ response = self.client.text_generation(
84
+ prompt=prompt,
85
+ temperature=temperature,
86
+ max_new_tokens=max_tokens,
87
+ do_sample=True,
88
+ stream=False # Get complete response, not a generator
89
+ )
 
 
 
 
 
 
90
 
91
  logger.info("Qwen inference successful")
92
  return response.strip()
93
 
 
 
 
 
 
 
 
 
 
94
  except Exception as e:
95
  logger.error(f"Qwen inference failed on attempt {attempt + 1}: {str(e)}")
96
  if attempt == self.max_retries - 1:
 
99
  # Check if it's a rate limit error (HTTP 429)
100
  if "429" in str(e) or "rate limit" in str(e).lower():
101
  logger.warning("Rate limit detected, waiting 60 seconds...")
102
+ time.sleep(60)
103
  else:
104
  wait_time = (2 ** attempt) + random.uniform(0, 1)
105
  logger.info(f"Retrying in {wait_time:.2f}s...")
106
+ time.sleep(wait_time)
107
 
108
  def _build_prompt(self, messages: List[Dict[str, str]]) -> str:
109
  """
backend/src/api/chat.py CHANGED
@@ -189,7 +189,7 @@ async def chat(
189
  ]
190
 
191
  # Get AI response
192
- ai_response = await qwen_client.generate(qwen_messages)
193
 
194
  # Check if AI wants to call a tool
195
  tool_call = extract_tool_call(ai_response)
@@ -212,7 +212,7 @@ async def chat(
212
  {"role": "user", "content": f"Tool executed successfully. Here is the result:\n{tool_result_text}\n\nPlease format this for the user in {language}."}
213
  ]
214
 
215
- final_response = await qwen_client.generate(followup_messages)
216
 
217
  # Save assistant response
218
  conv_repo.add_message(
 
189
  ]
190
 
191
  # Get AI response
192
+ ai_response = qwen_client.generate(qwen_messages)
193
 
194
  # Check if AI wants to call a tool
195
  tool_call = extract_tool_call(ai_response)
 
212
  {"role": "user", "content": f"Tool executed successfully. Here is the result:\n{tool_result_text}\n\nPlease format this for the user in {language}."}
213
  ]
214
 
215
+ final_response = qwen_client.generate(followup_messages)
216
 
217
  # Save assistant response
218
  conv_repo.add_message(