Steveeeeeeen HF Staff committed on
Commit
876f2fc
·
verified ·
1 Parent(s): b23448b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +30 -4
app.py CHANGED
@@ -1,3 +1,5 @@
 
 
1
  import os
2
  import shlex
3
  import subprocess
@@ -22,6 +24,10 @@ if hf_token is not None:
22
  import spaces
23
  import gradio as gr
24
 
 
 
 
 
25
  def save_tmp_audio(audio_bytes: bytes, cache_dir: str) -> str:
26
  """Save raw wav bytes to a temporary file and return path."""
27
  os.makedirs(cache_dir, exist_ok=True)
@@ -51,6 +57,9 @@ def reset_state(system_prompt: str):
51
  return [], [{"role": "system", "content": system_prompt}]
52
 
53
 
 
 
 
54
  _MODEL = None
55
  _TOK2WAV = None
56
 
@@ -61,14 +70,19 @@ def _get_models(model_path: str):
61
  """
62
  global _MODEL, _TOK2WAV
63
  if _MODEL is None or _TOK2WAV is None:
 
64
  from stepaudio2 import StepAudio2
65
  from token2wav import Token2wav
66
  _MODEL = StepAudio2(model_path)
67
  _TOK2WAV = Token2wav("token2wav")
68
  return _MODEL, _TOK2WAV
69
 
 
 
 
 
70
  @spaces.GPU
71
- def predict(chatbot, history, cache_dir, model_path="Step-Audio-2-mini"):
72
  """
73
  Run generation on GPU worker. All args must be picklable (strings, lists, dicts).
74
  Heavy models are created via _get_models() inside this process.
@@ -92,7 +106,7 @@ def predict(chatbot, history, cache_dir, model_path="Step-Audio-2-mini"):
92
  print(f"predict text={text!r}")
93
 
94
  # Convert tokens -> waveform bytes using token2wav
95
- audio_bytes = token2wav(audio_tokens)
96
 
97
  # Persist to temp .wav for the UI
98
  audio_path = save_tmp_audio(audio_bytes, cache_dir)
@@ -108,6 +122,10 @@ def predict(chatbot, history, cache_dir, model_path="Step-Audio-2-mini"):
108
 
109
  return chatbot, history
110
 
 
 
 
 
111
  def _launch_demo(args):
112
  with gr.Blocks(delete_cache=(86400, 86400)) as demo:
113
  gr.Markdown("""<center><font size=8>Step Audio 2 Demo</font></center>""")
@@ -131,6 +149,7 @@ def _launch_demo(args):
131
  type="messages",
132
  )
133
 
 
134
  history = gr.State([{"role": "system", "content": system_prompt.value}])
135
 
136
  mic = gr.Audio(type="filepath", label="🎤 Speak (optional)")
@@ -146,9 +165,10 @@ def _launch_demo(args):
146
  if error:
147
  gr.Warning(error)
148
  return chatbot2, history2, None, None
 
149
  chatbot2, history2 = predict(
150
  chatbot2, history2,
151
- args.cache_dir,
152
  model_path=args.model_path
153
  )
154
  return chatbot2, history2, None, None
@@ -171,6 +191,7 @@ def _launch_demo(args):
171
  )
172
 
173
  def on_regenerate(chatbot_val, history_val):
 
174
  while chatbot_val and chatbot_val[-1]["role"] == "assistant":
175
  chatbot_val.pop()
176
  while history_val and history_val[-1]["role"] == "assistant":
@@ -178,7 +199,7 @@ def _launch_demo(args):
178
  history_val.pop()
179
  return predict(
180
  chatbot_val, history_val,
181
- args.cache_dir,
182
  model_path=args.model_path
183
  )
184
 
@@ -194,6 +215,10 @@ def _launch_demo(args):
194
  server_name=args.server_name,
195
  )
196
 
 
 
 
 
197
  if __name__ == "__main__":
198
  from argparse import ArgumentParser
199
 
@@ -201,6 +226,7 @@ if __name__ == "__main__":
201
  parser.add_argument("--model-path", type=str, default="Step-Audio-2-mini", help="Model path.")
202
  parser.add_argument("--server-port", type=int, default=7860, help="Demo server port.")
203
  parser.add_argument("--server-name", type=str, default="0.0.0.0", help="Demo server name.")
 
204
  parser.add_argument("--cache-dir", type=str, default="/tmp/stepaudio2", help="Cache directory.")
205
  args = parser.parse_args()
206
 
 
1
+ # app.py
2
+
3
  import os
4
  import shlex
5
  import subprocess
 
24
  import spaces
25
  import gradio as gr
26
 
27
+
28
+ # -----------------------
29
+ # Utility helpers
30
+ # -----------------------
31
  def save_tmp_audio(audio_bytes: bytes, cache_dir: str) -> str:
32
  """Save raw wav bytes to a temporary file and return path."""
33
  os.makedirs(cache_dir, exist_ok=True)
 
57
  return [], [{"role": "system", "content": system_prompt}]
58
 
59
 
60
+ # -----------------------
61
+ # Lazy model loading inside the GPU worker
62
+ # -----------------------
63
  _MODEL = None
64
  _TOK2WAV = None
65
 
 
70
  """
71
  global _MODEL, _TOK2WAV
72
  if _MODEL is None or _TOK2WAV is None:
73
+ # Import here so the objects are constructed in the worker
74
  from stepaudio2 import StepAudio2
75
  from token2wav import Token2wav
76
  _MODEL = StepAudio2(model_path)
77
  _TOK2WAV = Token2wav("token2wav")
78
  return _MODEL, _TOK2WAV
79
 
80
+
81
+ # -----------------------
82
+ # Inference
83
+ # -----------------------
84
  @spaces.GPU
85
+ def predict(chatbot, history, prompt_wav, cache_dir, model_path="Step-Audio-2-mini"):
86
  """
87
  Run generation on GPU worker. All args must be picklable (strings, lists, dicts).
88
  Heavy models are created via _get_models() inside this process.
 
106
  print(f"predict text={text!r}")
107
 
108
  # Convert tokens -> waveform bytes using token2wav
109
+ audio_bytes = token2wav(audio_tokens, prompt_wav)
110
 
111
  # Persist to temp .wav for the UI
112
  audio_path = save_tmp_audio(audio_bytes, cache_dir)
 
122
 
123
  return chatbot, history
124
 
125
+
126
+ # -----------------------
127
+ # UI
128
+ # -----------------------
129
  def _launch_demo(args):
130
  with gr.Blocks(delete_cache=(86400, 86400)) as demo:
131
  gr.Markdown("""<center><font size=8>Step Audio 2 Demo</font></center>""")
 
149
  type="messages",
150
  )
151
 
152
+ # Initialize history with current system prompt value
153
  history = gr.State([{"role": "system", "content": system_prompt.value}])
154
 
155
  mic = gr.Audio(type="filepath", label="🎤 Speak (optional)")
 
165
  if error:
166
  gr.Warning(error)
167
  return chatbot2, history2, None, None
168
+ # Run GPU inference with only picklable args
169
  chatbot2, history2 = predict(
170
  chatbot2, history2,
171
+ args.prompt_wav, args.cache_dir,
172
  model_path=args.model_path
173
  )
174
  return chatbot2, history2, None, None
 
191
  )
192
 
193
  def on_regenerate(chatbot_val, history_val):
194
+ # Drop last assistant turn(s) to regenerate
195
  while chatbot_val and chatbot_val[-1]["role"] == "assistant":
196
  chatbot_val.pop()
197
  while history_val and history_val[-1]["role"] == "assistant":
 
199
  history_val.pop()
200
  return predict(
201
  chatbot_val, history_val,
202
+ args.prompt_wav, args.cache_dir,
203
  model_path=args.model_path
204
  )
205
 
 
215
  server_name=args.server_name,
216
  )
217
 
218
+
219
+ # -----------------------
220
+ # Entrypoint
221
+ # -----------------------
222
  if __name__ == "__main__":
223
  from argparse import ArgumentParser
224
 
 
226
  parser.add_argument("--model-path", type=str, default="Step-Audio-2-mini", help="Model path.")
227
  parser.add_argument("--server-port", type=int, default=7860, help="Demo server port.")
228
  parser.add_argument("--server-name", type=str, default="0.0.0.0", help="Demo server name.")
229
+ parser.add_argument("--prompt-wav", type=str, default="assets/default_female.wav", help="Prompt wave for the assistant.")
230
  parser.add_argument("--cache-dir", type=str, default="/tmp/stepaudio2", help="Cache directory.")
231
  args = parser.parse_args()
232