fix: use synchronous client to resolve StopIteration error
- Switched from AsyncInferenceClient to InferenceClient
- Removed async/await from generate method
- Use stream=False to get complete response
- More reliable and eliminates StopIteration error
Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
- backend/src/ai/qwen_client.py +16 -30
- backend/src/api/chat.py +2 -2
backend/src/ai/qwen_client.py
CHANGED
|
@@ -3,12 +3,11 @@
|
|
| 3 |
# Qwen Client - Hugging Face SDK wrapper with retry logic
|
| 4 |
|
| 5 |
import os
|
| 6 |
-
import asyncio
|
| 7 |
import logging
|
| 8 |
from typing import List, Dict, Any, Optional
|
| 9 |
import random
|
| 10 |
|
| 11 |
-
from huggingface_hub import
|
| 12 |
|
| 13 |
|
| 14 |
logger = logging.getLogger(__name__)
|
|
@@ -46,12 +45,12 @@ class QwenClient:
|
|
| 46 |
"Please set it in your .env file."
|
| 47 |
)
|
| 48 |
|
| 49 |
-
# Initialize
|
| 50 |
-
self.client =
|
| 51 |
|
| 52 |
logger.info(f"Qwen client initialized with model: {self.model}, timeout: {timeout}s")
|
| 53 |
|
| 54 |
-
|
| 55 |
self,
|
| 56 |
messages: List[Dict[str, str]],
|
| 57 |
temperature: float = 0.7,
|
|
@@ -71,6 +70,8 @@ class QwenClient:
|
|
| 71 |
Raises:
|
| 72 |
Exception: If all retries exhausted
|
| 73 |
"""
|
|
|
|
|
|
|
| 74 |
for attempt in range(self.max_retries):
|
| 75 |
try:
|
| 76 |
logger.info(f"Qwen inference attempt {attempt + 1}/{self.max_retries}")
|
|
@@ -78,33 +79,18 @@ class QwenClient:
|
|
| 78 |
# Build prompt from messages
|
| 79 |
prompt = self._build_prompt(messages)
|
| 80 |
|
| 81 |
-
# Call Hugging Face API
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
do_sample=True
|
| 90 |
-
):
|
| 91 |
-
response_parts.append(chunk)
|
| 92 |
-
return "".join(response_parts)
|
| 93 |
-
|
| 94 |
-
response = await asyncio.wait_for(collect_response(), timeout=self.timeout)
|
| 95 |
|
| 96 |
logger.info("Qwen inference successful")
|
| 97 |
return response.strip()
|
| 98 |
|
| 99 |
-
except asyncio.TimeoutError:
|
| 100 |
-
logger.warning(f"Qwen inference timeout on attempt {attempt + 1}")
|
| 101 |
-
if attempt == self.max_retries - 1:
|
| 102 |
-
raise TimeoutError(f"Qwen inference timed out after {self.max_retries} attempts")
|
| 103 |
-
# Exponential backoff with jitter
|
| 104 |
-
wait_time = (2 ** attempt) + random.uniform(0, 1)
|
| 105 |
-
logger.info(f"Retrying in {wait_time:.2f}s...")
|
| 106 |
-
await asyncio.sleep(wait_time)
|
| 107 |
-
|
| 108 |
except Exception as e:
|
| 109 |
logger.error(f"Qwen inference failed on attempt {attempt + 1}: {str(e)}")
|
| 110 |
if attempt == self.max_retries - 1:
|
|
@@ -113,11 +99,11 @@ class QwenClient:
|
|
| 113 |
# Check if it's a rate limit error (HTTP 429)
|
| 114 |
if "429" in str(e) or "rate limit" in str(e).lower():
|
| 115 |
logger.warning("Rate limit detected, waiting 60 seconds...")
|
| 116 |
-
|
| 117 |
else:
|
| 118 |
wait_time = (2 ** attempt) + random.uniform(0, 1)
|
| 119 |
logger.info(f"Retrying in {wait_time:.2f}s...")
|
| 120 |
-
|
| 121 |
|
| 122 |
def _build_prompt(self, messages: List[Dict[str, str]]) -> str:
|
| 123 |
"""
|
|
|
|
| 3 |
# Qwen Client - Hugging Face SDK wrapper with retry logic
|
| 4 |
|
| 5 |
import os
|
|
|
|
| 6 |
import logging
|
| 7 |
from typing import List, Dict, Any, Optional
|
| 8 |
import random
|
| 9 |
|
| 10 |
+
from huggingface_hub import InferenceClient
|
| 11 |
|
| 12 |
|
| 13 |
logger = logging.getLogger(__name__)
|
|
|
|
| 45 |
"Please set it in your .env file."
|
| 46 |
)
|
| 47 |
|
| 48 |
+
# Initialize synchronous inference client (more reliable than async)
|
| 49 |
+
self.client = InferenceClient(model=self.model, token=self.api_key)
|
| 50 |
|
| 51 |
logger.info(f"Qwen client initialized with model: {self.model}, timeout: {timeout}s")
|
| 52 |
|
| 53 |
+
def generate(
|
| 54 |
self,
|
| 55 |
messages: List[Dict[str, str]],
|
| 56 |
temperature: float = 0.7,
|
|
|
|
| 70 |
Raises:
|
| 71 |
Exception: If all retries exhausted
|
| 72 |
"""
|
| 73 |
+
import time
|
| 74 |
+
|
| 75 |
for attempt in range(self.max_retries):
|
| 76 |
try:
|
| 77 |
logger.info(f"Qwen inference attempt {attempt + 1}/{self.max_retries}")
|
|
|
|
| 79 |
# Build prompt from messages
|
| 80 |
prompt = self._build_prompt(messages)
|
| 81 |
|
| 82 |
+
# Call Hugging Face API with synchronous client
|
| 83 |
+
response = self.client.text_generation(
|
| 84 |
+
prompt=prompt,
|
| 85 |
+
temperature=temperature,
|
| 86 |
+
max_new_tokens=max_tokens,
|
| 87 |
+
do_sample=True,
|
| 88 |
+
stream=False # Get complete response, not a generator
|
| 89 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
|
| 91 |
logger.info("Qwen inference successful")
|
| 92 |
return response.strip()
|
| 93 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
except Exception as e:
|
| 95 |
logger.error(f"Qwen inference failed on attempt {attempt + 1}: {str(e)}")
|
| 96 |
if attempt == self.max_retries - 1:
|
|
|
|
| 99 |
# Check if it's a rate limit error (HTTP 429)
|
| 100 |
if "429" in str(e) or "rate limit" in str(e).lower():
|
| 101 |
logger.warning("Rate limit detected, waiting 60 seconds...")
|
| 102 |
+
time.sleep(60)
|
| 103 |
else:
|
| 104 |
wait_time = (2 ** attempt) + random.uniform(0, 1)
|
| 105 |
logger.info(f"Retrying in {wait_time:.2f}s...")
|
| 106 |
+
time.sleep(wait_time)
|
| 107 |
|
| 108 |
def _build_prompt(self, messages: List[Dict[str, str]]) -> str:
|
| 109 |
"""
|
backend/src/api/chat.py
CHANGED
|
@@ -189,7 +189,7 @@ async def chat(
|
|
| 189 |
]
|
| 190 |
|
| 191 |
# Get AI response
|
| 192 |
-
ai_response =
|
| 193 |
|
| 194 |
# Check if AI wants to call a tool
|
| 195 |
tool_call = extract_tool_call(ai_response)
|
|
@@ -212,7 +212,7 @@ async def chat(
|
|
| 212 |
{"role": "user", "content": f"Tool executed successfully. Here is the result:\n{tool_result_text}\n\nPlease format this for the user in {language}."}
|
| 213 |
]
|
| 214 |
|
| 215 |
-
final_response =
|
| 216 |
|
| 217 |
# Save assistant response
|
| 218 |
conv_repo.add_message(
|
|
|
|
| 189 |
]
|
| 190 |
|
| 191 |
# Get AI response
|
| 192 |
+
ai_response = qwen_client.generate(qwen_messages)
|
| 193 |
|
| 194 |
# Check if AI wants to call a tool
|
| 195 |
tool_call = extract_tool_call(ai_response)
|
|
|
|
| 212 |
{"role": "user", "content": f"Tool executed successfully. Here is the result:\n{tool_result_text}\n\nPlease format this for the user in {language}."}
|
| 213 |
]
|
| 214 |
|
| 215 |
+
final_response = qwen_client.generate(followup_messages)
|
| 216 |
|
| 217 |
# Save assistant response
|
| 218 |
conv_repo.add_message(
|