Spaces:

Steveeeeeeen
/

Step-Audio-2-mini

Running on Zero

App Files Files Community

Steveeeeeeen HF Staff committed on Aug 29

Commit

676ffac

verified ·

1 Parent(s): e8f2ced

Update app.py

Browse files

Files changed (1) hide show

app.py +107 -72

app.py CHANGED Viewed

@@ -1,31 +1,28 @@
 import os
 import shlex
 import subprocess
-# install requirements
 os.system("pip install -r requirements.txt")
-# wget https://huggingface.co/stepfun-ai/Step-Audio-2-mini/blob/main/token2wav/campplus.onnx in token2wav folder
 os.system("wget https://huggingface.co/stepfun-ai/Step-Audio-2-mini/resolve/main/token2wav/campplus.onnx -P token2wav")
 os.system("wget https://huggingface.co/stepfun-ai/Step-Audio-2-mini/resolve/main/token2wav/flow.pt -P token2wav")
 os.system("wget https://huggingface.co/stepfun-ai/Step-Audio-2-mini/resolve/main/token2wav/flow.yaml -P token2wav")
 os.system("wget https://huggingface.co/stepfun-ai/Step-Audio-2-mini/resolve/main/token2wav/hift.pt -P token2wav")
-# get hf token
 hf_token = os.getenv("HF_TOKEN", None)
 os.environ["HF_TOKEN"] = hf_token
-import tempfile
-import traceback
-from pathlib import Path
 import spaces
 import gradio as gr
-def save_tmp_audio(audio, cache_dir):
-    with tempfile.NamedTemporaryFile(
-        dir=cache_dir, delete=False, suffix=".wav"
-    ) as temp_audio:
-        temp_audio.write(audio)
     return temp_audio.name
 def add_message(chatbot, history, mic, text):
@@ -37,68 +34,122 @@ def add_message(chatbot, history, mic, text):
         history.append({"role": "human", "content": text})
     elif mic and Path(mic).exists():
         chatbot.append({"role": "user", "content": {"path": mic}})
-        history.append({"role": "human", "content": [{"type":"audio", "audio": mic}]})
-    print(f"{history=}")
     return chatbot, history, None
 def reset_state(system_prompt):
     return [], [{"role": "system", "content": system_prompt}]
 @spaces.GPU
-def predict(chatbot, history, audio_model, token2wav, prompt_wav, cache_dir):
     try:
-        history.append({"role": "assistant", "content": [{"type": "text", "text": "<tts_start>"}], "eot": False})
-        tokens, text, audio = audio_model(history, max_new_tokens=4096, temperature=0.7, repetition_penalty=1.05, do_sample=True)
-        print(f"predict {text=}")
-        audio = token2wav(audio, prompt_wav)
-        audio_path = save_tmp_audio(audio, cache_dir)
         chatbot.append({"role": "assistant", "content": {"path": audio_path}})
         history[-1]["content"].append({"type": "token", "token": tokens})
         history[-1]["eot"] = True
     except Exception:
         print(traceback.format_exc())
-        gr.Warning(f"Some error happend, please try again.")
     return chatbot, history
-def _launch_demo(args, audio_model, token2wav):
     with gr.Blocks(delete_cache=(86400, 86400)) as demo:
-        gr.Markdown("""<center><font size=8>Step Audio 2 Demo</center>""")
         with gr.Row():
             system_prompt = gr.Textbox(
                 label="System Prompt",
-                value="你的名字叫做小跃，是由阶跃星辰公司训练出来的语音大模型。\n你情感细腻，观察能力强，擅长分析用户的内容，并作出善解人意的回复，说话的过程中时刻注意用户的感受，富有同理心，提供多样的情绪价值。\n今天是2025年8月29日，星期五\n请用默认女声与用户交流。",
                 lines=2
             )
-        chatbot = gr.Chatbot(
-            elem_id="chatbot",
-            #avatar_images=["assets/user.png", "assets/assistant.png"],
-            min_height=800,
-            type="messages",
-        )
         history = gr.State([{"role": "system", "content": system_prompt.value}])
         mic = gr.Audio(type="filepath")
         text = gr.Textbox(placeholder="Enter message ...")
         with gr.Row():
             clean_btn = gr.Button("🧹 Clear History (清除历史)")
             regen_btn = gr.Button("🤔️ Regenerate (重试)")
             submit_btn = gr.Button("🚀 Submit")
-        def on_submit(chatbot, history, mic, text):
-            chatbot, history, error = add_message(
-                chatbot, history, mic, text
-            )
             if error:
-                gr.Warning(error)  # 显示警告消息
-                return chatbot, history, None, None
-            else:
-                chatbot, history = predict(chatbot, history, audio_model, token2wav, args.prompt_wav, args.cache_dir)
                 return chatbot, history, None, None
         submit_btn.click(
             fn=on_submit,
-            inputs=[chatbot, history, mic, text],
             outputs=[chatbot, history, mic, text],
             concurrency_limit=4,
             concurrency_id="gpu_queue",
@@ -108,55 +159,39 @@ def _launch_demo(args, audio_model, token2wav):
             fn=reset_state,
             inputs=[system_prompt],
             outputs=[chatbot, history],
-            #show_progress=True,
         )
-        def regenerate(chatbot, history):
             while chatbot and chatbot[-1]["role"] == "assistant":
                 chatbot.pop()
             while history and history[-1]["role"] == "assistant":
-                print(f"discard {history[-1]}")
                 history.pop()
-            return predict(chatbot, history, audio_model, token2wav, args.prompt_wav, args.cache_dir)
         regen_btn.click(
-            regenerate,
-            [chatbot, history],
-            [chatbot, history],
-            #show_progress=True,
             concurrency_id="gpu_queue",
         )
-    demo.queue().launch(
-        server_port=args.server_port,
-        server_name=args.server_name,
-    )
 if __name__ == "__main__":
-    import os
     from argparse import ArgumentParser
-    from stepaudio2 import StepAudio2
-    from token2wav import Token2wav
     parser = ArgumentParser()
-    parser.add_argument("--model-path", type=str, default='Step-Audio-2-mini', help="Model path.")
-    parser.add_argument(
-        "--server-port", type=int, default=7860, help="Demo server port."
-    )
-    parser.add_argument(
-        "--server-name", type=str, default="0.0.0.0", help="Demo server name."
-    )
-    parser.add_argument(
-        "--prompt-wav", type=str, default="assets/default_female.wav", help="Prompt wave for the assistant."
-    )
-    parser.add_argument(
-        "--cache-dir", type=str, default="/tmp/stepaudio2", help="Cache directory."
-    )
     args = parser.parse_args()
     os.environ["GRADIO_TEMP_DIR"] = args.cache_dir
-    audio_model = StepAudio2(args.model_path)
-    token2wav = Token2wav("token2wav")
-    _launch_demo(args, audio_model, token2wav)

 import os
 import shlex
 import subprocess
+import threading
+import tempfile
+import traceback
+from pathlib import Path
 # Startup bootstrap: install the Python requirements, then fetch the four
 # token2wav assets with wget into the token2wav directory.
 # os.system() returns the command's exit status; a non-zero status is
 # reported here instead of being discarded. Failures are only reported,
 # never raised, so startup continues exactly as it did before.
 for _bootstrap_cmd in (
     "pip install -r requirements.txt",
     "wget https://huggingface.co/stepfun-ai/Step-Audio-2-mini/resolve/main/token2wav/campplus.onnx -P token2wav",
     "wget https://huggingface.co/stepfun-ai/Step-Audio-2-mini/resolve/main/token2wav/flow.pt -P token2wav",
     "wget https://huggingface.co/stepfun-ai/Step-Audio-2-mini/resolve/main/token2wav/flow.yaml -P token2wav",
     "wget https://huggingface.co/stepfun-ai/Step-Audio-2-mini/resolve/main/token2wav/hift.pt -P token2wav",
 ):
     _bootstrap_status = os.system(_bootstrap_cmd)
     if _bootstrap_status != 0:
         print(f"[bootstrap] command failed (status {_bootstrap_status}): {_bootstrap_cmd}")
# HF token passthrough: re-export HF_TOKEN only when it is actually set.
# os.environ values must be strings, so assigning None (variable absent)
# would raise TypeError while the module is being imported.
hf_token = os.getenv("HF_TOKEN", None)
if hf_token is not None:
    os.environ["HF_TOKEN"] = hf_token
 import spaces
 import gradio as gr
def save_tmp_audio(audio_bytes, cache_dir):
    """Write *audio_bytes* to a new ``.wav`` file in *cache_dir* and return its path.

    Args:
        audio_bytes: Bytes-like payload written verbatim to the file.
        cache_dir: Directory for the file; created if it does not exist.

    Returns:
        Path of the file that was written. The file is created with
        ``delete=False``, so it persists after this function returns.

    Raises:
        Whatever the write (or the close that flushes it) raises. In that
        case the partially written file is removed before re-raising, so a
        failed call does not leave an orphan in *cache_dir*.
    """
    os.makedirs(cache_dir, exist_ok=True)
    temp_audio = tempfile.NamedTemporaryFile(dir=cache_dir, delete=False, suffix=".wav")
    try:
        # The with-block closes (and flushes) the handle on both paths.
        with temp_audio:
            temp_audio.write(audio_bytes)
    except Exception:
        # delete=False means nothing else will clean this file up.
        try:
            os.unlink(temp_audio.name)
        except OSError:
            pass
        raise
    return temp_audio.name
 def add_message(chatbot, history, mic, text):
         history.append({"role": "human", "content": text})
     elif mic and Path(mic).exists():
         chatbot.append({"role": "user", "content": {"path": mic}})
+        history.append({"role": "human", "content": [{"type": "audio", "audio": mic}]})
     return chatbot, history, None
 def reset_state(system_prompt):
     """Return a cleared chat state seeded with *system_prompt*.

     Returns:
         A 2-tuple ``(chatbot, history)``: an empty chatbot message list and
         a history list holding only the system message.
     """
     system_message = {"role": "system", "content": system_prompt}
     empty_chatbot = []
     fresh_history = [system_message]
     return empty_chatbot, fresh_history
# Process-wide cache for the two model objects, filled by _ensure_models().
_AUDIO_MODEL = None
_TOKEN2WAV = None
_INIT_LOCK = threading.Lock()


def _ensure_models(model_path: str, token2wav_dir: str):
    """Return the cached ``(StepAudio2, Token2wav)`` pair, building it on first use.

    The instances live in the module globals ``_AUDIO_MODEL`` and
    ``_TOKEN2WAV``. Construction happens under ``_INIT_LOCK`` and each global
    is re-checked inside the lock, so concurrent first callers in the same
    process build each object only once.

    Each object is built only if it is still missing. If one constructor
    raises, a later retry therefore does not rebuild the object that had
    already been created.

    Args:
        model_path: Passed to ``StepAudio2`` when that object is built.
        token2wav_dir: Passed to ``Token2wav`` when that object is built.

    Returns:
        Tuple ``(audio_model, token2wav)``.

    NOTE: once an object is cached, later calls ignore the corresponding
    argument and return the cached instance.
    """
    global _AUDIO_MODEL, _TOKEN2WAV
    if _AUDIO_MODEL is None or _TOKEN2WAV is None:
        with _INIT_LOCK:
            # Imports stay function-local; the original author's note says
            # this is to avoid importing these modules before process fork.
            if _AUDIO_MODEL is None:
                from stepaudio2 import StepAudio2
                _AUDIO_MODEL = StepAudio2(model_path)
            if _TOKEN2WAV is None:
                from token2wav import Token2wav
                _TOKEN2WAV = Token2wav(token2wav_dir)
    return _AUDIO_MODEL, _TOKEN2WAV
 @spaces.GPU
 def predict(chatbot, history, prompt_wav, cache_dir, model_path, token2wav_dir):
     """Generate one spoken assistant reply and append it to the conversation.

     All parameters are plain lists/strings; the model objects are obtained
     inside the call through ``_ensure_models(model_path, token2wav_dir)``.

     Args:
         chatbot: Chatbot message list; gains an assistant entry pointing at
             the generated audio file on success.
         history: Model-side history list; gains the finished assistant turn
             on success.
         prompt_wav: Passed as the second argument to the token2wav callable.
         cache_dir: Directory in which the generated audio file is saved.
         model_path: Forwarded to ``_ensure_models``.
         token2wav_dir: Forwarded to ``_ensure_models``.

     Returns:
         Tuple ``(chatbot, history)``.

     Any exception is caught: the traceback is printed, a Gradio warning is
     raised for the user, and the unfinished assistant placeholder is removed
     from *history* so a failed attempt does not leave a dangling
     ``<tts_start>`` turn (``eot: False``) in front of the next request.
     """
     placeholder = None
     try:
         audio_model, token2wav = _ensure_models(model_path, token2wav_dir)

         # Open the assistant turn with the start marker; it is completed below.
         placeholder = {
             "role": "assistant",
             "content": [{"type": "text", "text": "<tts_start>"}],
             "eot": False,
         }
         history.append(placeholder)

         # The second returned value is not used by this function.
         tokens, _, audio_tokens = audio_model(
             history,
             max_new_tokens=4096,
             temperature=0.7,
             repetition_penalty=1.05,
             do_sample=True,
         )

         # Convert the audio tokens and store the result where the Chatbot
         # component can load it by path.
         audio_bytes = token2wav(audio_tokens, prompt_wav)
         audio_path = save_tmp_audio(audio_bytes, cache_dir)
         chatbot.append({"role": "assistant", "content": {"path": audio_path}})

         # Finish the assistant turn.
         placeholder["content"].append({"type": "token", "token": tokens})
         placeholder["eot"] = True
     except Exception:
         print(traceback.format_exc())
         # Roll back the half-built assistant turn, if it was appended.
         if placeholder is not None and history and history[-1] is placeholder:
             history.pop()
         gr.Warning("Some error happened, please try again.")
     return chatbot, history
+def _launch_demo(args):
     with gr.Blocks(delete_cache=(86400, 86400)) as demo:
+        gr.Markdown("""<center><font size=8>Step Audio 2 Demo</font></center>""")
         with gr.Row():
             system_prompt = gr.Textbox(
                 label="System Prompt",
+                value=(
+                    "你的名字叫做小跃，是由阶跃星辰公司训练出来的语音大模型。\n"
+                    "你情感细腻，观察能力强，擅长分析用户的内容，并作出善解人意的回复，说话的过程中时刻注意用户的感受，富有同理心，提供多样的情绪价值。\n"
+                    "今天是2025年8月29日，星期五\n"
+                    "请用默认女声与用户交流。"
+                ),
                 lines=2
             )
+        chatbot = gr.Chatbot(elem_id="chatbot", min_height=800, type="messages")
+        # Initialize history with the *string* value of the prompt
         history = gr.State([{"role": "system", "content": system_prompt.value}])
+        # Inputs
         mic = gr.Audio(type="filepath")
         text = gr.Textbox(placeholder="Enter message ...")
+        # Serializable configuration inputs (STRINGS ONLY)
+        model_path = gr.Textbox(value="Step-Audio-2-mini", label="Model path")
+        token2wav_dir = gr.Textbox(value="token2wav", label="Token2Wav directory")
+        prompt_wav = gr.Textbox(value="assets/default_female.wav", label="Prompt WAV path")
+        cache_dir = gr.Textbox(value="/tmp/stepaudio2", label="Cache directory")
         with gr.Row():
             clean_btn = gr.Button("🧹 Clear History (清除历史)")
             regen_btn = gr.Button("🤔️ Regenerate (重试)")
             submit_btn = gr.Button("🚀 Submit")
+        # --- event functions (now only use serializable args) ---
+        def on_submit(chatbot, history, mic, text, prompt_wav, cache_dir, model_path, token2wav_dir):
+            chatbot, history, error = add_message(chatbot, history, mic, text)
             if error:
+                gr.Warning(error)
                 return chatbot, history, None, None
+            chatbot, history = predict(chatbot, history, prompt_wav, cache_dir, model_path, token2wav_dir)
+            return chatbot, history, None, None
         submit_btn.click(
             fn=on_submit,
+            inputs=[chatbot, history, mic, text, prompt_wav, cache_dir, model_path, token2wav_dir],
             outputs=[chatbot, history, mic, text],
             concurrency_limit=4,
             concurrency_id="gpu_queue",
             fn=reset_state,
             inputs=[system_prompt],
             outputs=[chatbot, history],
         )
+        def on_regen(chatbot, history, prompt_wav, cache_dir, model_path, token2wav_dir):
+            # drop last assistant turn so we can re-run
             while chatbot and chatbot[-1]["role"] == "assistant":
                 chatbot.pop()
             while history and history[-1]["role"] == "assistant":
                 history.pop()
+            return predict(chatbot, history, prompt_wav, cache_dir, model_path, token2wav_dir)
         regen_btn.click(
+            fn=on_regen,
+            inputs=[chatbot, history, prompt_wav, cache_dir, model_path, token2wav_dir],
+            outputs=[chatbot, history],
             concurrency_id="gpu_queue",
         )
+    demo.queue().launch(server_port=args.server_port, server_name=args.server_name)
 if __name__ == "__main__":
     from argparse import ArgumentParser

     # Command-line options as (flag, type, default, help) rows, registered
     # in this order.
     cli_options = (
         ("--model-path", str, "Step-Audio-2-mini", "Model path."),
         ("--server-port", int, 7860, "Demo server port."),
         ("--server-name", str, "0.0.0.0", "Demo server name."),
         ("--prompt-wav", str, "assets/default_female.wav", "Prompt wave for the assistant."),
         ("--cache-dir", str, "/tmp/stepaudio2", "Cache directory."),
     )
     parser = ArgumentParser()
     for flag, value_type, default_value, help_text in cli_options:
         parser.add_argument(flag, type=value_type, default=default_value, help=help_text)
     args = parser.parse_args()

     # Point GRADIO_TEMP_DIR at the cache directory and make sure it exists.
     cache_root = args.cache_dir
     os.environ["GRADIO_TEMP_DIR"] = cache_root
     os.makedirs(cache_root, exist_ok=True)

     # Heavy models are deliberately not built here; predict() creates them
     # lazily via _ensure_models(...).
     # NOTE(review): in the visible part of _launch_demo only server_port and
     # server_name are read from args - confirm whether the other options
     # are meant to feed the UI defaults.
     _launch_demo(args)