Spaces:

stepfun-ai
/

Step-Audio-R1

Running

App Files Community

moevis committed on Nov 26, 2025

Commit

10a6457

verified ·

1 Parent(s): 9b74786

Update app.py

Browse files

Files changed (1) hide show

app.py +75 -65

app.py CHANGED Viewed

@@ -65,10 +65,15 @@ def format_messages(system, history, user_text, audio_data_list=None):
     for item in history:
         # 支持 list of dicts 格式
         if isinstance(item, dict) and "role" in item and "content" in item:
-            messages.append(item)
         # 支持 Gradio ChatMessage 对象
         elif hasattr(item, "role") and hasattr(item, "content"):
-            messages.append({"role": item.role, "content": item.content})
     # 添加当前用户消息
     if user_text and audio_data_list:
@@ -94,6 +99,10 @@ def format_messages(system, history, user_text, audio_data_list=None):
         messages.append({"role": "user", "content": user_text})
     elif audio_data_list:
         content = []
         for audio_data in audio_data_list:
             content.append({
                 "type": "input_audio",
@@ -102,10 +111,6 @@ def format_messages(system, history, user_text, audio_data_list=None):
                     "format": "wav"
                 }
             })
-        messages.append({
-            "role": "user",
-            "content": content
-        })
     return messages
@@ -116,7 +121,8 @@ def chat(system_prompt, user_text, audio_file, history, max_tokens, temperature,
         model_name = MODEL_NAME
     if not user_text and not audio_file:
-        return history or [], "Please enter text or upload audio"
     # Ensure history is a list and formatted correctly
     history = history or []
@@ -136,7 +142,8 @@ def chat(system_prompt, user_text, audio_file, history, max_tokens, temperature,
     messages = format_messages(system_prompt, history, user_text, audio_data_list)
     if not messages:
-        return history or [], "Invalid input"
     # Debug: Print message format
     print(f"[DEBUG] Messages to API: {json.dumps(messages, ensure_ascii=False, indent=2)}")
@@ -144,6 +151,27 @@ def chat(system_prompt, user_text, audio_file, history, max_tokens, temperature,
     for i, msg in enumerate(messages):
         print(f"[DEBUG] Message {i}: {type(msg)} - {msg}")
     try:
         with httpx.Client(base_url=API_BASE_URL, timeout=120) as client:
             response = client.post("/chat/completions", json={
@@ -165,10 +193,13 @@ def chat(system_prompt, user_text, audio_file, history, max_tokens, temperature,
                     error_msg += " - Bad request"
                 elif response.status_code == 500:
                     error_msg += " - Model error"
-                return history, error_msg
             # Process streaming response
-            content_parts = []
             for line in response.iter_lines():
                 if not line:
                     continue
@@ -187,66 +218,45 @@ def chat(system_prompt, user_text, audio_file, history, max_tokens, temperature,
                         if 'choices' in data and len(data['choices']) > 0:
                             delta = data['choices'][0].get('delta', {})
                             if 'content' in delta:
-                                content_parts.append(delta['content'])
                     except json.JSONDecodeError:
                         continue
-            full_content = ''.join(content_parts)
-            # Update history - only add when no error
-            history = history or []
-            # Add user message
-            if audio_file:
-                # If audio exists, show audio file and text (if any)
-                # Gradio Chatbot supports tuple (file_path,) to show file
-                # But in messages format, we need to construct proper content
-                # Here we use tuple format to let Gradio render audio player, or use HTML
-                # Simpler way: if multimodal, add messages separately
-                # 1. Add audio message
-                history.append({"role": "user", "content": gr.Audio(audio_file)})
-                # 2. If text exists, add text message
-                if user_text:
-                    history.append({"role": "user", "content": user_text})
-            else:
-                # Text only
-                history.append({"role": "user", "content": user_text})
-            # Split think and content
-            if "</think>" in full_content:
-                parts = full_content.split("</think>", 1)
-                think_content = parts[0].strip()
-                response_content = parts[1].strip()
-                # Remove possible start tag
-                if think_content.startswith("<think>"):
-                    think_content = think_content[len("<think>"):].strip()
-                # Add thinking process message (use ChatMessage and metadata)
-                if think_content:
-                    history.append(gr.ChatMessage(
-                        role="assistant",
-                        content=think_content,
-                        metadata={"title": "⏳ Thinking Process"}
-                    ))
-                # Add formal response message
-                if response_content:
-                    history.append({"role": "assistant", "content": response_content})
-            else:
-                # No think tag, add full response directly
-                assistant_text = full_content.strip()
-                if assistant_text:
-                    history.append({"role": "assistant", "content": assistant_text})
-            return history, ""
     except httpx.ConnectError:
-        return history, "❌ Cannot connect to vLLM API"
     except Exception as e:
-        return history, f"❌ Error: {str(e)}"
 # Gradio Interface
 with gr.Blocks(title="Step Audio R1") as demo:

     for item in history:
         # 支持 list of dicts 格式
         if isinstance(item, dict) and "role" in item and "content" in item:
+            # Filter out non-serializable content (e.g. gr.Audio components)
+            content = item["content"]
+            if isinstance(content, (str, list, dict)):
+                messages.append(item)
         # 支持 Gradio ChatMessage 对象
         elif hasattr(item, "role") and hasattr(item, "content"):
+            content = item.content
+            if isinstance(content, (str, list, dict)):
+                messages.append({"role": item.role, "content": content})
     # 添加当前用户消息
     if user_text and audio_data_list:
         messages.append({"role": "user", "content": user_text})
     elif audio_data_list:
         content = []
+        messages.append({
+            "role": "user",
+            "content": content
+        })
         for audio_data in audio_data_list:
             content.append({
                 "type": "input_audio",
                     "format": "wav"
                 }
             })
     return messages
         model_name = MODEL_NAME
     if not user_text and not audio_file:
+        yield history or [], "Please enter text or upload audio"
+        return
     # Ensure history is a list and formatted correctly
     history = history or []
     messages = format_messages(system_prompt, history, user_text, audio_data_list)
     if not messages:
+        yield history or [], "Invalid input"
+        return
     # Debug: Print message format
     print(f"[DEBUG] Messages to API: {json.dumps(messages, ensure_ascii=False, indent=2)}")
     for i, msg in enumerate(messages):
         print(f"[DEBUG] Message {i}: {type(msg)} - {msg}")
+    # Update history with user message immediately
+    if audio_file:
+        # 1. Add audio message
+        history.append({"role": "user", "content": gr.Audio(audio_file)})
+        # 2. If text exists, add text message
+        if user_text:
+            history.append({"role": "user", "content": user_text})
+    else:
+        # Text only
+        history.append({"role": "user", "content": user_text})
+    # Add thinking placeholder
+    history.append(gr.ChatMessage(
+        role="assistant",
+        content="",
+        metadata={"title": "⏳ Thinking Process"}
+    ))
+    yield history, "Generating..."
     try:
         with httpx.Client(base_url=API_BASE_URL, timeout=120) as client:
             response = client.post("/chat/completions", json={
                     error_msg += " - Bad request"
                 elif response.status_code == 500:
                     error_msg += " - Model error"
+                yield history, error_msg
+                return
             # Process streaming response
+            buffer = ""
+            is_thinking = True
             for line in response.iter_lines():
                 if not line:
                     continue
                         if 'choices' in data and len(data['choices']) > 0:
                             delta = data['choices'][0].get('delta', {})
                             if 'content' in delta:
+                                content = delta['content']
+                                buffer += content
+                                if is_thinking:
+                                    if "</think>" in buffer:
+                                        is_thinking = False
+                                        parts = buffer.split("</think>", 1)
+                                        think_content = parts[0]
+                                        response_content = parts[1]
+                                        if think_content.startswith("<think>"):
+                                            think_content = think_content[len("<think>"):].strip()
+                                        # Update thinking message
+                                        history[-1].content = think_content
+                                        # Add response message
+                                        history.append({"role": "assistant", "content": response_content})
+                                    else:
+                                        # Update thinking message
+                                        current_think = buffer
+                                        if current_think.startswith("<think>"):
+                                            current_think = current_think[len("<think>"):]
+                                        history[-1].content = current_think
+                                else:
+                                    # Already split, just update response message
+                                    parts = buffer.split("</think>", 1)
+                                    response_content = parts[1]
+                                    history[-1]["content"] = response_content
+                                yield history, ""
                     except json.JSONDecodeError:
                         continue
     except httpx.ConnectError:
+        yield history, "❌ Cannot connect to vLLM API"
     except Exception as e:
+        yield history, f"❌ Error: {str(e)}"
 # Gradio Interface
 with gr.Blocks(title="Step Audio R1") as demo: