reach-vb committed on
Commit
d17cd30
·
verified ·
1 Parent(s): d668318
Files changed (1) hide show
  1. app.py +151 -0
app.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import tempfile
3
+ import traceback
4
+ from pathlib import Path
5
+
6
+ import gradio as gr
7
+ import spaces # required for ZeroGPU
8
+
9
+ # ---- Your model libs (ensure these are available in the repo or pip) ----
10
+ from stepaudio2 import StepAudio2
11
+ from token2wav import Token2wav
12
+
13
# ------------------------- constants -------------------------
MODEL_PATH = "Step-Audio-2-mini"  # checkpoint directory, relative to the repo root
PROMPT_WAV = "assets/default_female.wav"  # reference voice passed to the vocoder
CACHE_DIR = "/tmp/stepaudio2"  # writable scratch space for generated WAVs

# Ensure Gradio uses a writable temp dir on Spaces
os.environ["GRADIO_TEMP_DIR"] = CACHE_DIR
Path(CACHE_DIR).mkdir(parents=True, exist_ok=True)
21
+
22
# ------------------------- helpers -------------------------
def save_tmp_audio(audio_bytes: bytes, cache_dir: str) -> str:
    """Persist raw WAV bytes to a unique ``.wav`` file and return its path.

    The file is created under *cache_dir* (created if missing) and is NOT
    deleted automatically -- Gradio serves it back to the client later.
    """
    Path(cache_dir).mkdir(parents=True, exist_ok=True)
    fd, wav_path = tempfile.mkstemp(dir=cache_dir, suffix=".wav")
    with os.fdopen(fd, "wb") as sink:
        sink.write(audio_bytes)
    return wav_path
28
+
29
def add_message(chatbot, history, mic, text):
    """Append the user's turn to both the UI and model message lists.

    Text input takes precedence over the microphone recording when both are
    provided.  Returns ``(chatbot, history, error)`` where ``error`` is
    ``None`` on success or a human-readable message for ``gr.Warning``.
    """
    if not mic and not text:
        return chatbot, history, "Input is empty"

    if text:
        chatbot.append({"role": "user", "content": text})
        history.append({"role": "human", "content": text})
    elif mic and Path(mic).exists():
        # Chatbot renders the audio file; the model gets a typed audio part.
        chatbot.append({"role": "user", "content": {"path": mic}})
        history.append({"role": "human", "content": [{"type": "audio", "audio": mic}]})
    else:
        # BUGFIX: previously a mic path pointing at a missing file fell
        # through and reported success with nothing appended, so the submit
        # handler ran inference on stale history.  Surface it as an error.
        return chatbot, history, "Audio file not found"
    return chatbot, history, None
40
+
41
def reset_state(system_prompt):
    """Clear the visible chat and re-seed model history with the system prompt."""
    fresh_history = [{"role": "system", "content": system_prompt}]
    return [], fresh_history
43
+
44
# ------------------------- globals -------------------------
# Both models are instantiated once at import time on CPU; gpu_predict moves
# them to CUDA only while a ZeroGPU slot is attached to the process.
AUDIO_MODEL = StepAudio2(MODEL_PATH)  # load on CPU
TOKEN2WAV = Token2wav(f"{MODEL_PATH}/token2wav")  # load on CPU
47
+
48
@spaces.GPU(duration=120)  # GPU only during this call; no-ops outside ZeroGPU
def gpu_predict(chatbot, history):
    """Run one assistant turn: generate tokens, vocode to WAV, update state.

    Parameters
    ----------
    chatbot : list
        Gradio-facing message list; the reply is appended as an audio message.
    history : list
        Model-facing message list; an assistant entry is appended and only
        marked complete (``eot=True``) once generation succeeds.

    Returns
    -------
    tuple
        ``(chatbot, history)``, unchanged if an error occurred.
    """
    global AUDIO_MODEL, TOKEN2WAV
    try:
        # Move to CUDA only when the GPU is attached.  Best-effort on purpose:
        # the wrappers may not expose .to(), and CPU-only runs should proceed.
        try:
            if hasattr(AUDIO_MODEL, "to"):
                AUDIO_MODEL.to("cuda")
            if hasattr(TOKEN2WAV, "to"):
                TOKEN2WAV.to("cuda")
        except Exception:
            pass

        # Open an in-progress assistant turn; eot=False marks it incomplete.
        history.append({"role": "assistant", "content": [{"type": "text", "text": "<tts_start>"}], "eot": False})

        try:
            tokens, text, audio_tokens = AUDIO_MODEL(
                history,
                max_new_tokens=4096,
                temperature=0.7,
                repetition_penalty=1.05,
                do_sample=True,
            )

            audio_bytes = TOKEN2WAV(audio_tokens, PROMPT_WAV)
            audio_path = save_tmp_audio(audio_bytes, CACHE_DIR)

            chatbot.append({"role": "assistant", "content": {"path": audio_path}})
            history[-1]["content"].append({"type": "token", "token": tokens})
            history[-1]["eot"] = True
        except Exception:
            # BUGFIX: drop the half-finished assistant entry so a retry does
            # not feed the model a dangling <tts_start> turn left over from
            # this failed call.
            history.pop()
            raise

    except Exception:
        # Top-level boundary: log the traceback server-side and show a
        # friendly warning in the UI; state is returned so chat can continue.
        print(traceback.format_exc())
        gr.Warning("Some error happened, please try again.")
    return chatbot, history
82
+
83
def build_demo():
    """Build and return the Gradio Blocks UI for the Step Audio 2 demo."""
    with gr.Blocks(delete_cache=(86400, 86400)) as demo:
        gr.Markdown("<center><font size=8>Step Audio 2 Demo</center>")

        with gr.Row():
            # NOTE(review): the default-value literals below appear mojibake
            # (UTF-8 bytes decoded as another codepage, likely originally
            # Chinese) -- confirm the intended text before editing them.
            system_prompt = gr.Textbox(
                label="System Prompt",
                value=(
                    "ไฝ ็š„ๅๅญ—ๅซๅšๅฐ่ทƒ๏ผŒๆ˜ฏ็”ฑ้˜ถ่ทƒๆ˜Ÿ่พฐๅ…ฌๅธ่ฎญ็ปƒๅ‡บๆฅ็š„่ฏญ้Ÿณๅคงๆจกๅž‹ใ€‚\n"
                    "ไฝ ๆƒ…ๆ„Ÿ็ป†่…ป๏ผŒ่ง‚ๅฏŸ่ƒฝๅŠ›ๅผบ๏ผŒๆ“…้•ฟๅˆ†ๆž็”จๆˆท็š„ๅ†…ๅฎน๏ผŒๅนถไฝœๅ‡บๅ–„่งฃไบบๆ„็š„ๅ›žๅค๏ผŒ"
                    "่ฏด่ฏ็š„่ฟ‡็จ‹ไธญๆ—ถๅˆปๆณจๆ„็”จๆˆท็š„ๆ„Ÿๅ—๏ผŒๅฏŒๆœ‰ๅŒ็†ๅฟƒ๏ผŒๆไพ›ๅคšๆ ท็š„ๆƒ…็ปชไปทๅ€ผใ€‚\n"
                    "ไปŠๅคฉๆ˜ฏ2025ๅนด8ๆœˆ29ๆ—ฅ๏ผŒๆ˜ŸๆœŸไบ”\n"
                    "่ฏท็”จ้ป˜่ฎคๅฅณๅฃฐไธŽ็”จๆˆทไบคๆตใ€‚"
                ),
                lines=2,
            )

        # Two parallel conversation states: `chatbot` is what the user sees,
        # `history` is the model-facing list seeded with the system prompt.
        chatbot = gr.Chatbot(elem_id="chatbot", min_height=800, type="messages")
        history = gr.State([{"role": "system", "content": system_prompt.value}])

        mic = gr.Audio(type="filepath", label="๐ŸŽ™๏ธ Microphone input (optional)")
        text = gr.Textbox(placeholder="Enter message ...", label="๐Ÿ’ฌ Text input")

        with gr.Row():
            clean_btn = gr.Button("๐Ÿงน Clear History (ๆธ…้™คๅކๅฒ)")
            regen_btn = gr.Button("๐Ÿค”๏ธ Regenerate (้‡่ฏ•)")
            submit_btn = gr.Button("๐Ÿš€ Submit")

        def on_submit(chatbot, history, mic, text):
            # Validate/append the user turn, then run inference; the trailing
            # (None, None) clears the mic and text inputs either way.
            chatbot, history, error = add_message(chatbot, history, mic, text)
            if error:
                gr.Warning(error)
                return chatbot, history, None, None
            chatbot, history = gpu_predict(chatbot, history)
            return chatbot, history, None, None

        submit_btn.click(
            fn=on_submit,
            inputs=[chatbot, history, mic, text],
            outputs=[chatbot, history, mic, text],
            concurrency_limit=4,
            concurrency_id="gpu_queue",  # shared queue with Regenerate
        )

        clean_btn.click(
            fn=reset_state,
            inputs=[system_prompt],
            outputs=[chatbot, history],
        )

        def regenerate(chatbot, history):
            # Drop trailing assistant turn(s) from both views, then re-run
            # inference on what remains.
            while chatbot and chatbot[-1]["role"] == "assistant":
                chatbot.pop()
            while history and history[-1]["role"] == "assistant":
                history.pop()
            return gpu_predict(chatbot, history)

        regen_btn.click(
            regenerate,
            [chatbot, history],
            [chatbot, history],
            concurrency_id="gpu_queue",
        )
    return demo
147
+
148
# Spaces runs this file; just build and launch with defaults (no ports/names).
if __name__ == "__main__":
    # queue() is required so the concurrency ids on the click handlers apply.
    build_demo().queue().launch()  # no args -- Spaces handles host/port