zhenjiangjie committed
Commit 3f8009c
1 Parent(s): 39b1e03
Files changed (4)
  1. Dockerfile +6 -6
  2. app.py +231 -357
  3. requirements.txt +3 -1
  4. start_services.sh +81 -0
Dockerfile CHANGED
@@ -11,16 +11,16 @@ COPY --chown=user requirements.txt .
  RUN pip install --no-cache-dir --upgrade -r requirements.txt

  # 3. Then copy all application files
- COPY --chown=user start_gradio.sh app.py .
+ COPY --chown=user app.py start_services.sh ./

- # 4. Set script permissions if needed
- RUN chmod +x start_gradio.sh
+ # 4. Set script permissions
+ RUN chmod +x start_services.sh

- # Expose the Gradio port
- EXPOSE 7860
+ # Expose ports
+ EXPOSE 7860 9999

  # Set environment variables
  ENV VLLM_ALLOW_LONG_MAX_MODEL_LEN=1

  # Startup script
- CMD ["./start_gradio.sh"]
+ CMD ["./start_services.sh"]
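The updated Dockerfile packages two services in one container: the vLLM OpenAI-compatible API on port 9999 and the Gradio UI on port 7860, both launched by `start_services.sh`. A minimal sketch for checking that both respond after the container is up, assuming it was started locally with both ports published (the image name is illustrative):

```python
# Probe both services, e.g. after `docker run -p 7860:7860 -p 9999:9999 <image>`.
import httpx

for name, url in [
    ("Gradio UI", "http://localhost:7860/"),
    ("vLLM API", "http://localhost:9999/v1/models"),
]:
    try:
        status = httpx.get(url, timeout=5).status_code
        print(f"{name}: HTTP {status}")  # 200 means the service is serving
    except httpx.HTTPError as exc:
        print(f"{name}: unreachable ({exc})")
```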
app.py CHANGED
@@ -1,405 +1,279 @@
  #!/usr/bin/env python3
- # -*- coding: utf-8 -*-
  """
- Gradio multimodal chat interface: runs inference by calling vLLM.LLM directly inside app.py
  """

  import base64
  import os
- import sys
- import threading
- import time
- import traceback
- from typing import Optional, Tuple

  import gradio as gr

- # Check command-line arguments before importing vllm to decide whether to enable it;
- # this allows previewing the interface without vllm installed
- if "--no-vllm" in sys.argv:
-     os.environ["ENABLE_VLLM"] = "false"

- # Check whether vLLM mode is enabled
- ENABLE_VLLM = os.getenv("ENABLE_VLLM", "true").lower() in ("true", "1", "yes")
-
- if ENABLE_VLLM:
-     try:
-         from vllm import LLM, SamplingParams
-     except ImportError as err:
-         print("[WARNING] Failed to import vllm, falling back to interface preview mode")
-         print(f"[DETAIL] ImportError: {err}")
-         traceback.print_exc()
-         print("[INFO] To use vLLM, verify it is installed and importable in the container")
-         ENABLE_VLLM = False
-         LLM = None
-         SamplingParams = None
- else:
-     LLM = None
-     SamplingParams = None
-     print("[INFO] Running in interface preview mode, vLLM not loaded")
-
- # Default configuration, overridable via environment variables or CLI
- DEFAULT_MODEL_ID = os.getenv("MODEL_NAME", "stepfun-ai/Step-Audio-2-mini-Think")
- DEFAULT_MODEL_PATH = os.getenv("MODEL_PATH", DEFAULT_MODEL_ID)
- DEFAULT_TP = int(os.getenv("TENSOR_PARALLEL_SIZE", "4"))
- DEFAULT_MAX_MODEL_LEN = int(os.getenv("MAX_MODEL_LEN", "8192"))
- DEFAULT_GPU_UTIL = float(os.getenv("GPU_MEMORY_UTILIZATION", "0.9"))
- DEFAULT_TOKENIZER_MODE = os.getenv("TOKENIZER_MODE", "step_audio_2")
- DEFAULT_SERVED_NAME = os.getenv("SERVED_MODEL_NAME", "step-audio-2-mini-think")
-
- _llm: Optional[LLM] = None
- _llm_lock = threading.Lock()
- LLM_ARGS = {
-     "model": DEFAULT_MODEL_PATH,
-     "trust_remote_code": True,
-     "tensor_parallel_size": DEFAULT_TP,
-     "tokenizer_mode": DEFAULT_TOKENIZER_MODE,
-     "max_model_len": DEFAULT_MAX_MODEL_LEN,
-     "served_model_name": DEFAULT_SERVED_NAME,
-     "gpu_memory_utilization": DEFAULT_GPU_UTIL,
- }
-
-
- def encode_audio_to_base64(audio_path: Optional[str]) -> Optional[dict]:
-     """Encode an audio file as base64"""
-     if audio_path is None:
          return None
-
      try:
-         with open(audio_path, "rb") as audio_file:
-             audio_data = audio_file.read()
-         audio_base64 = base64.b64encode(audio_data).decode('utf-8')
-         # Try to infer the format from the file extension
-         ext = os.path.splitext(audio_path)[1].lower().lstrip('.')
-         if not ext:
-             ext = "wav"  # default format
-         return {
-             "data": audio_base64,
-             "format": ext
-         }
      except Exception as e:
-         print(f"Error encoding audio: {e}")
          return None

-
- def format_messages(
-     system_prompt: str,
-     chat_history: list,
-     user_text: str,
-     audio_file: Optional[str]
- ) -> list:
-     """Format messages into the OpenAI API format"""
      messages = []
-
-     # Add the system prompt
-     if system_prompt and system_prompt.strip():
          messages.append({
-             "role": "system",
-             "content": system_prompt.strip()
          })
-
-     # Add the conversation history
-     for human, assistant in chat_history:
-         if human:
-             messages.append({"role": "user", "content": human})
-         if assistant:
-             messages.append({"role": "assistant", "content": assistant})
-
-     # Add the current user input
-     content_parts = []
-
-     # Add text input
-     if user_text and user_text.strip():
-         content_parts.append({
-             "type": "text",
-             "text": user_text.strip()
          })
-
-     # Add audio input
-     if audio_file:
-         audio_data = encode_audio_to_base64(audio_file)
-         if audio_data:
-             content_parts.append({
-                 "type": "input_audio",
-                 "input_audio": audio_data
-             })
-
-     if content_parts:
-         # If there is only one text part, use the string directly
-         if len(content_parts) == 1 and content_parts[0]["type"] == "text":
-             messages.append({
-                 "role": "user",
-                 "content": content_parts[0]["text"]
-             })
-         else:
-             messages.append({
-                 "role": "user",
-                 "content": content_parts
-             })
-
      return messages


- def chat_predict(
-     system_prompt: str,
-     user_text: str,
-     audio_file: Optional[str],
-     chat_history: list,
-     max_tokens: int,
-     temperature: float,
-     top_p: float
- ) -> Tuple[list, str]:
-     """Run inference with the local vLLM LLM"""
      if not user_text and not audio_file:
-         return chat_history, " Please provide text or audio input"
-
-     # In preview mode, return a mock response
-     if not ENABLE_VLLM:
-         user_display = user_text if user_text else "[audio input]"
-         mock_response = f"This is a mock reply. You said: {user_text[:50] if user_text else 'audio'}"
-         chat_history.append((user_display, mock_response))
-         return chat_history, ""
-
-     messages = format_messages(system_prompt, chat_history, user_text, audio_file)
      if not messages:
-         return chat_history, " No valid input"
-
      try:
-         llm = _get_llm()
-         sampling_params = SamplingParams(
-             max_tokens=max_tokens,
-             temperature=temperature,
-             top_p=top_p,
-         )
-         start_time = time.time()
-         outputs = llm.chat(messages, sampling_params=sampling_params, use_tqdm=False)
-         latency = time.time() - start_time
-
-         if not outputs or not outputs[0].outputs:
-             return chat_history, "⚠ The model returned no output"
-
-         assistant_message = outputs[0].outputs[0].text
-         user_display = user_text if user_text else "[audio input]"
-         chat_history.append((user_display, assistant_message))
-         return chat_history, ""
-     except Exception as e:
-         import traceback
-         traceback.print_exc()
-         return chat_history, ""


- def _get_llm() -> LLM:
-     """Initialize the LLM as a singleton"""
-     if not ENABLE_VLLM:
-         raise RuntimeError("vLLM is not enabled, cannot load the model")
-
-     global _llm
-     if _llm is not None:
-         return _llm
-
-     with _llm_lock:
-         if _llm is not None:
-             return _llm
-         print(f"[LLM] Initializing with args: {LLM_ARGS}")
-         _llm = LLM(**LLM_ARGS)
-     return _llm


- def _set_llm_args(**kwargs) -> None:
-     """Update the LLM initialization arguments"""
-     global LLM_ARGS, _llm
-     LLM_ARGS = kwargs
-     _llm = None  # force a reload with the new configuration


- # Build the Gradio interface
- with gr.Blocks(title="Step Audio 2 Chat", theme=gr.themes.Soft()) as demo:
-     gr.Markdown(
-         """
-         # Step Audio R1 Demo
-         """
-     )
-
      with gr.Row():
-         # Left: parameter configuration
          with gr.Column(scale=1):
-             gr.Markdown("### Configuration")
-
-             system_prompt = gr.Textbox(
-                 label="System Prompt",
-                 placeholder="Enter a system prompt...",
-                 lines=4,
-                 value="You are an expert in audio analysis, please analyze the audio content and answer the questions accurately"
-             )
-
-             with gr.Row():
-                 max_tokens = gr.Slider(
-                     label="Max Tokens",
-                     minimum=1,
-                     maximum=16384,
-                     value=8192,
-                     step=1
-                 )
-
-             with gr.Row():
-                 temperature = gr.Slider(
-                     label="Temperature",
-                     minimum=0.0,
-                     maximum=2.0,
-                     value=0.7,
-                     step=0.1
                  )
-
-             top_p = gr.Slider(
-                 label="Top P",
-                 minimum=0.0,
-                 maximum=1.0,
-                 value=0.9,
-                 step=0.05
-             )
-
-         # Right: chat and input
-         with gr.Column(scale=1):
-             gr.Markdown("### Chat")
-             chatbot = gr.Chatbot(
-                 label="Chat History",
-                 height=400,
-                 show_copy_button=True,
-                 type="messages"
-             )
-
-             user_text = gr.Textbox(
-                 label="Text Input",
-                 placeholder="Enter your message...",
-                 lines=2
-             )
-
-             audio_file = gr.Audio(
-                 label="Audio Input",
-                 type="filepath",
-                 sources=["microphone", "upload"]
-             )
-
              with gr.Row():
-                 submit_btn = gr.Button("Submit", variant="primary", size="lg")
-                 clear_btn = gr.Button("Clear", variant="secondary")
-
-             status_text = gr.Textbox(label="Status", interactive=False, visible=False)
-
-     # Event bindings
      submit_btn.click(
-         fn=chat_predict,
-         inputs=[
-             system_prompt,
-             user_text,
-             audio_file,
-             chatbot,
-             max_tokens,
-             temperature,
-             top_p
-         ],
-         outputs=[chatbot, status_text]
      )
-
      clear_btn.click(
          fn=lambda: ([], "", None),
          outputs=[chatbot, user_text, audio_file]
      )

-
  if __name__ == "__main__":
      import argparse
-
-     parser = argparse.ArgumentParser(description="Step Audio 2 Gradio Chat Interface")
-     parser.add_argument(
-         "--host",
-         type=str,
-         default="0.0.0.0",
-         help="Server host address"
-     )
-     parser.add_argument(
-         "--port",
-         type=int,
-         default=7860,
-         help="Server port"
-     )
-     parser.add_argument(
-         "--model",
-         type=str,
-         default=DEFAULT_MODEL_PATH,
-         help="Model name or local path"
-     )
-     parser.add_argument(
-         "--tensor-parallel-size",
-         type=int,
-         default=DEFAULT_TP,
-         help="Tensor parallel size"
-     )
-     parser.add_argument(
-         "--max-model-len",
-         type=int,
-         default=DEFAULT_MAX_MODEL_LEN,
-         help="Maximum context length"
-     )
-     parser.add_argument(
-         "--gpu-memory-utilization",
-         type=float,
-         default=DEFAULT_GPU_UTIL,
-         help="GPU memory utilization"
-     )
-     parser.add_argument(
-         "--tokenizer-mode",
-         type=str,
-         default=DEFAULT_TOKENIZER_MODE,
-         help="Tokenizer mode"
-     )
-     parser.add_argument(
-         "--served-model-name",
-         type=str,
-         default=DEFAULT_SERVED_NAME,
-         help="Model name exposed to clients"
-     )
-     parser.add_argument(
-         "--no-vllm",
-         action="store_true",
-         help="Disable vLLM and launch interface preview mode only"
-     )
-
      args = parser.parse_args()
-
-     # The --no-vllm flag is handled at the top of the file; this is just a notice
-     if args.no_vllm and not ENABLE_VLLM:
-         print("[INFO] vLLM disabled, running in interface preview mode")
-
-     _set_llm_args(
-         model=args.model,
-         trust_remote_code=True,
-         tensor_parallel_size=args.tensor_parallel_size,
-         tokenizer_mode=args.tokenizer_mode,
-         max_model_len=args.max_model_len,
-         served_model_name=args.served_model_name,
-         gpu_memory_utilization=args.gpu_memory_utilization,
-     )
-
-     print("==========================================")
-     print("Step Audio 2 Gradio Chat")
-     if ENABLE_VLLM:
-         print("Mode: vLLM inference")
-         print(f"Model: {args.model}")
-         print(f"Tensor Parallel Size: {args.tensor_parallel_size}")
-         print(f"Max Model Len: {args.max_model_len}")
-         print(f"Tokenizer Mode: {args.tokenizer_mode}")
-         print(f"Served Model Name: {args.served_model_name}")
-     else:
-         print("Mode: interface preview (no vLLM)")
-     print(f"Gradio URL: http://{args.host}:{args.port}")
-     print("==========================================")
-
-     demo.queue().launch(
-         server_name=args.host,
-         server_port=args.port,
-         share=False
-     )
  #!/usr/bin/env python3
  """
+ Step Audio R1 vLLM Gradio Interface
  """

  import base64
+ import json
  import os

  import gradio as gr
+ import httpx

+ API_BASE_URL = os.getenv("API_BASE_URL", "http://localhost:9999/v1")
+ MODEL_NAME = os.getenv("MODEL_NAME", "Step-Audio-R1")


+ def encode_audio(audio_path):
+     """Encode audio as base64"""
+     if not audio_path or not os.path.exists(audio_path):
          return None
      try:
+         with open(audio_path, "rb") as f:
+             return base64.b64encode(f.read()).decode()
      except Exception as e:
+         print(f"[DEBUG] Audio error: {e}")
          return None

+ def format_messages(system, history, user_text, audio_data=None, audio_format="wav"):
+     """Format the message list"""
      messages = []
+     if system:
+         messages.append({"role": "system", "content": system})
+
+     if not history:
+         history = []
+
+     # Process the history
+     for item in history:
+         # Support list-of-dicts format
+         if isinstance(item, dict) and "role" in item and "content" in item:
+             messages.append(item)
+         # Support Gradio ChatMessage objects
+         elif hasattr(item, "role") and hasattr(item, "content"):
+             messages.append({"role": item.role, "content": item.content})
+
+     # Add the current user message
+     if user_text and audio_data:
          messages.append({
+             "role": "user",
+             "content": [
+                 {
+                     "type": "input_audio",
+                     "input_audio": {
+                         "data": audio_data,
+                         "format": audio_format
+                     }
+                 },
+                 {
+                     "type": "text",
+                     "text": user_text
+                 }
+             ]
          })
+     elif user_text:
+         messages.append({"role": "user", "content": user_text})
+     elif audio_data:
+         messages.append({
+             "role": "user",
+             "content": [
+                 {
+                     "type": "input_audio",
+                     "input_audio": {
+                         "data": audio_data,
+                         "format": audio_format
+                     }
+                 }
+             ]
          })
+
      return messages

+ def chat(system_prompt, user_text, audio_file, history, max_tokens, temperature, top_p, model_name=None):
+     """Chat function"""
+     # If no model is specified, use the global configuration
+     if model_name is None:
+         model_name = MODEL_NAME
      if not user_text and not audio_file:
+         return history or [], "Please enter text or upload audio"
+
+     # Ensure history is a list and formatted correctly
+     history = history or []
+     clean_history = []
+     for item in history:
+         if isinstance(item, dict) and 'role' in item and 'content' in item:
+             clean_history.append(item)
+         elif hasattr(item, "role") and hasattr(item, "content"):
+             # Keep ChatMessage objects
+             clean_history.append(item)
+     history = clean_history
+
+     # Process audio
+     audio_data = None
+     audio_format = "wav"
+     if audio_file:
+         audio_data = encode_audio(audio_file)
+         if audio_file.lower().endswith(".mp3"):
+             audio_format = "mp3"
+
+     messages = format_messages(system_prompt, history, user_text, audio_data, audio_format)
      if not messages:
+         return history or [], "Invalid input"
+
+     # Debug: print the message format
+     print(f"[DEBUG] Messages to API: {json.dumps(messages, ensure_ascii=False, indent=2)}")
+     print(f"[DEBUG] Messages type: {type(messages)}")
+     for i, msg in enumerate(messages):
+         print(f"[DEBUG] Message {i}: {type(msg)} - {msg}")
+
      try:
+         with httpx.Client(base_url=API_BASE_URL, timeout=120) as client:
+             response = client.post("/chat/completions", json={
+                 "model": model_name,
+                 "messages": messages,
+                 "max_tokens": max_tokens,
+                 "temperature": temperature,
+                 "top_p": top_p,
+                 "stream": True
+             })
+
+             if response.status_code != 200:
+                 error_msg = f"❌ API Error {response.status_code}"
+                 if response.status_code == 404:
+                     error_msg += " - vLLM service not ready"
+                 elif response.status_code == 400:
+                     error_msg += " - Bad request"
+                 elif response.status_code == 500:
+                     error_msg += " - Model error"
+                 return history, error_msg
+
+             # Process the streaming response
+             content_parts = []
+             for line in response.iter_lines():
+                 if not line:
+                     continue
+                 # Ensure line is a string
+                 if isinstance(line, bytes):
+                     line = line.decode('utf-8')
+                 else:
+                     line = str(line)
+
+                 if line.startswith('data: '):
+                     data_str = line[6:]
+                     if data_str.strip() == '[DONE]':
+                         break
+                     try:
+                         data = json.loads(data_str)
+                         if 'choices' in data and len(data['choices']) > 0:
+                             delta = data['choices'][0].get('delta', {})
+                             if 'content' in delta:
+                                 content_parts.append(delta['content'])
+                     except json.JSONDecodeError:
+                         continue
+
+             full_content = ''.join(content_parts)
+
+             # Update history - only append when there is no error
+             history = history or []
+
+             # Add the user message
+             if audio_file:
+                 # If audio exists, show the audio file and the text (if any).
+                 # Gradio Chatbot supports a (file_path,) tuple to show a file,
+                 # but in messages format we need to construct proper content.
+                 # The simpler way for multimodal input: append separate messages.
+
+                 # 1. Add the audio message
+                 history.append({"role": "user", "content": gr.Audio(audio_file)})
+
+                 # 2. If text exists, add a text message
+                 if user_text:
+                     history.append({"role": "user", "content": user_text})
+             else:
+                 # Text only
+                 history.append({"role": "user", "content": user_text})
+
+             # Split thinking and content
+             if "</think>" in full_content:
+                 parts = full_content.split("</think>", 1)
+                 think_content = parts[0].strip()
+                 response_content = parts[1].strip()
+
+                 # Remove a possible opening tag
+                 if think_content.startswith("<think>"):
+                     think_content = think_content[len("<think>"):].strip()
+
+                 # Add the thinking-process message (uses ChatMessage with metadata)
+                 if think_content:
+                     history.append(gr.ChatMessage(
+                         role="assistant",
+                         content=think_content,
+                         metadata={"title": "⏳ Thinking Process"}
+                     ))
+
+                 # Add the final response message
+                 if response_content:
+                     history.append({"role": "assistant", "content": response_content})
+             else:
+                 # No think tag, add the full response directly
+                 assistant_text = full_content.strip()
+                 if assistant_text:
+                     history.append({"role": "assistant", "content": assistant_text})
+
+             return history, ""
+
+     except httpx.ConnectError:
+         return history, "❌ Cannot connect to vLLM API"
+     except Exception as e:
+         return history, f"❌ Error: {str(e)}"
+
+ # Gradio interface
+ with gr.Blocks(title="Step Audio R1") as demo:
+     gr.Markdown("# Step Audio R1 Chat")

      with gr.Row():
+         # Left: configuration
          with gr.Column(scale=1):
+             with gr.Accordion("Configuration", open=True):
+                 system_prompt = gr.Textbox(
+                     label="System Prompt",
+                     lines=2,
+                     value="You are an audio analysis expert"
                  )
+                 max_tokens = gr.Slider(1, 8192, value=1024, label="Max Tokens")
+                 temperature = gr.Slider(0.0, 2.0, value=0.7, label="Temperature")
+                 top_p = gr.Slider(0.0, 1.0, value=0.9, label="Top P")
+
+             status = gr.Textbox(label="Status", interactive=False)
+
+         # Right: chat
+         with gr.Column(scale=2):
+             chatbot = gr.Chatbot(label="Chat History", height=450)
+             user_text = gr.Textbox(label="Input", lines=2, placeholder="Enter message...")
+             audio_file = gr.Audio(label="Audio", type="filepath", sources=["microphone", "upload"])
+
              with gr.Row():
+                 submit_btn = gr.Button("Send", variant="primary", scale=2)
+                 clear_btn = gr.Button("Clear", scale=1)
+
+     # Event bindings - the functions are defined at startup.
+     # Bind the chat function directly; do not pass an external `model_to_use`,
+     # chat uses the default `MODEL_NAME` via its internal parameter
      submit_btn.click(
+         fn=chat,
+         inputs=[system_prompt, user_text, audio_file, chatbot, max_tokens, temperature, top_p],
+         outputs=[chatbot, status]
      )
+
      clear_btn.click(
          fn=lambda: ([], "", None),
          outputs=[chatbot, user_text, audio_file]
      )

  if __name__ == "__main__":
      import argparse
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--host", default="0.0.0.0")
+     parser.add_argument("--port", type=int, default=7860)
+     parser.add_argument("--model", default=MODEL_NAME)
      args = parser.parse_args()
+
+     # Update the global model name
+     if args.model:
+         MODEL_NAME = args.model
+
+     print(f"Starting Gradio: http://{args.host}:{args.port}")
+     print(f"API URL: {API_BASE_URL}")
+     print(f"Model: {MODEL_NAME}")
+
+     demo.launch(server_name=args.host, server_port=args.port, share=False)
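The `chat()` function above does two things worth isolating: it accumulates `delta.content` chunks from vLLM's server-sent-event stream, then splits an optional `<think>…</think>` reasoning block from the visible answer. A self-contained sketch of both steps against a canned stream (the sample lines are illustrative, shaped like the `stream: true` output handled in `chat()`; no server required):

```python
import json

# Canned SSE lines in the shape emitted by /chat/completions with "stream": true
SAMPLE_STREAM = [
    'data: {"choices": [{"delta": {"content": "<think>Short clip, "}}]}',
    'data: {"choices": [{"delta": {"content": "spoken voice.</think>"}}]}',
    'data: {"choices": [{"delta": {"content": "The audio is a spoken sentence."}}]}',
    'data: [DONE]',
]

# Step 1: accumulate delta content, mirroring the loop in chat()
parts = []
for line in SAMPLE_STREAM:
    if not line.startswith("data: "):
        continue
    payload = line[len("data: "):]
    if payload.strip() == "[DONE]":
        break
    delta = json.loads(payload)["choices"][0].get("delta", {})
    if "content" in delta:
        parts.append(delta["content"])
full_content = "".join(parts)

# Step 2: separate the reasoning block from the final answer
if "</think>" in full_content:
    think, answer = full_content.split("</think>", 1)
    think = think.removeprefix("<think>").strip()  # Python 3.9+
else:
    think, answer = "", full_content

print("think:", think)            # Short clip, spoken voice.
print("answer:", answer.strip())  # The audio is a spoken sentence.
```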
requirements.txt CHANGED
@@ -1 +1,3 @@
  gradio>=4.0.0
+ httpx
+ huggingface_hub
start_services.sh ADDED
@@ -0,0 +1,81 @@
+ #!/bin/bash
+ set -euo pipefail
+
+ # Configuration
+ MODEL_REPO="${MODEL_REPO:-stepfun-ai/Step-Audio-R1}"
+ MODEL_DIR="${MODEL_DIR:-/tmp/models/Step-Audio-R1}"
+ API_PORT="${API_PORT:-9999}"
+ GRADIO_PORT="${GRADIO_PORT:-7860}"
+
+ echo "Starting Step Audio R1 services..."
+ echo "Model: $MODEL_REPO"
+ echo "Model Dir: $MODEL_DIR"
+ echo "API Port: $API_PORT"
+
+ # Download the model (if needed)
+ if [[ ! -d "$MODEL_DIR" ]] || [[ ! -f "$MODEL_DIR/config.json" ]]; then
+     echo "Downloading model to: $MODEL_DIR"
+     mkdir -p "$MODEL_DIR"
+
+     if command -v hf &> /dev/null; then
+         hf download "$MODEL_REPO" --local-dir "$MODEL_DIR"
+     elif command -v huggingface-cli &> /dev/null; then
+         huggingface-cli download "$MODEL_REPO" --local-dir "$MODEL_DIR" --local-dir-use-symlinks False
+     else
+         echo "Neither hf nor huggingface-cli found. Cannot download model."
+         exit 1
+     fi
+
+     echo "✓ Model downloaded"
+ else
+     echo "✓ Model already exists locally"
+ fi
+
+ # Chat template for Step-Audio-R1
+ CHAT_TEMPLATE='{%- macro render_content(content) -%}{%- if content is string -%}{{- content.replace("<audio_patch>\\n", "<audio_patch>") -}}{%- elif content is mapping -%}{{- content["'"'"'value'"'"'] if '"'"'value'"'"' in content else content["'"'"'text'"'"'] -}}{%- elif content is iterable -%}{%- for item in content -%}{%- if item.type == '"'"'text'"'"' -%}{{- item["'"'"'value'"'"'] if '"'"'value'"'"' in item else item["'"'"'text'"'"'] -}}{%- elif item.type == '"'"'audio'"'"' -%}<audio_patch>{%- endif -%}{%- endfor -%}{%- endif -%}{%- endmacro -%}{%- if tools -%}{{- '"'"'<|BOT|>system\\n'"'"' -}}{%- if messages[0]["'"'"'role'"'"'] == '"'"'system'"'"' -%}{{- render_content(messages[0]["'"'"'content'"'"']) + '"'"'<|EOT|>'"'"' -}}{%- endif -%}{{- '"'"'<|BOT|>tool_json_schemas\\n'"'"' + tools|tojson + '"'"'<|EOT|>'"'"' -}}{%- else -%}{%- if messages[0]["'"'"'role'"'"'] == '"'"'system'"'"' -%}{{- '"'"'<|BOT|>system\\n'"'"' + render_content(messages[0]["'"'"'content'"'"']) + '"'"'<|EOT|>'"'"' -}}{%- endif -%}{%- endif -%}{%- for message in messages -%}{%- if message["role"] == "user" -%}{{- '"'"'<|BOT|>human\\n'"'"' + render_content(message["content"]) + '"'"'<|EOT|>'"'"' -}}{%- elif message["role"] == "assistant" -%}{{- '"'"'<|BOT|>assistant\\n'"'"' + (render_content(message["content"]) if message["content"] else '"'"''"'"') -}}{%- set is_last_assistant = true -%}{%- for m in messages[loop.index:] -%}{%- if m["role"] == "assistant" -%}{%- set is_last_assistant = false -%}{%- endif -%}{%- endfor -%}{%- if not is_last_assistant -%}{{- '"'"'<|EOT|>'"'"' -}}{%- endif -%}{%- elif message["role"] == "function_output" -%}{%- else -%}{%- if not (loop.first and message["role"] == "system") -%}{{- '"'"'<|BOT|>'"'"' + message["role"] + '"'"'\\n'"'"' + render_content(message["content"]) + '"'"'<|EOT|>'"'"' -}}{%- endif -%}{%- endif -%}{%- endfor -%}{%- if add_generation_prompt -%}{{- '"'"'<|BOT|>assistant\\n'"'"' -}}{%- endif -%}'
+
+ # Start the vLLM API in the background
+ python3 -m vllm.entrypoints.openai.api_server \
+     --model "$MODEL_DIR" \
+     --port "$API_PORT" \
+     --host 0.0.0.0 \
+     --max-model-len 65536 \
+     --tensor-parallel-size 4 \
+     --gpu-memory-utilization 0.85 \
+     --trust-remote-code \
+     --interleave-mm-strings \
+     --chat-template "$CHAT_TEMPLATE" \
+     &
+
+ VLLM_PID=$!
+ echo "vLLM started (PID: $VLLM_PID)"
+
+ # Clean up: kill the background vLLM process when this script exits
+ trap 'kill $VLLM_PID' EXIT
+
+ # Wait for vLLM to be ready
+ echo "Waiting for vLLM to be ready..."
+ for i in {1..30}; do
+     if curl -s "http://localhost:$API_PORT/v1/models" > /dev/null 2>&1; then
+         echo "✓ vLLM is ready (checked $i/30 times)"
+         break
+     fi
+
+     if [ $i -eq 30 ]; then
+         echo "❌ vLLM startup timeout after 60 seconds"
+         echo "Checking vLLM process:"
+         ps aux | grep "vllm.entrypoints.openai.api_server" || echo "vLLM process not found"
+         echo "Port $API_PORT status:"
+         netstat -tlnp | grep ":$API_PORT " || echo "Port $API_PORT not listening"
+         exit 1
+     fi
+
+     echo "Waiting for vLLM... ($i/30)"
+     sleep 2
+ done
+
+ # Start Gradio (runs in the foreground)
+ export API_BASE_URL="http://localhost:$API_PORT/v1"
+ export MODEL_NAME="Step-Audio-R1"
+
+ python3 app.py --host 0.0.0.0 --port "$GRADIO_PORT"
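Once the readiness loop reports the API is up, the OpenAI-compatible endpoint can be smoke-tested independently of Gradio. A minimal sketch, assuming the defaults above (API on localhost:9999, httpx from requirements.txt installed); the model id is read from `/v1/models` rather than hard-coded, since vLLM registers the model under its `--model` path when no `--served-model-name` is given:

```python
# Smoke test for the endpoint started by start_services.sh.
import httpx

BASE = "http://localhost:9999/v1"  # matches the API_PORT default above

with httpx.Client(base_url=BASE, timeout=120) as client:
    # Discover the registered model id instead of guessing it
    model_id = client.get("/models").json()["data"][0]["id"]
    print("serving:", model_id)

    resp = client.post("/chat/completions", json={
        "model": model_id,
        "messages": [{"role": "user", "content": "Reply with one short sentence."}],
        "max_tokens": 64,
    })
    resp.raise_for_status()
    print(resp.json()["choices"][0]["message"]["content"])
```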