Spaces:

Steveeeeeeen
/

Step-Audio-2-mini

Running on Zero

App Files Files Community

Steveeeeeeen HF Staff committed on Aug 29, 2025

Commit

b23448b

verified ·

1 Parent(s): a9a5df1

Update app.py

Browse files

Files changed (1) hide show

app.py +4 -9

app.py CHANGED Viewed

@@ -61,7 +61,6 @@ def _get_models(model_path: str):
     """
     global _MODEL, _TOK2WAV
     if _MODEL is None or _TOK2WAV is None:
-        # Import here so the objects are constructed in the worker
         from stepaudio2 import StepAudio2
         from token2wav import Token2wav
         _MODEL = StepAudio2(model_path)
@@ -69,7 +68,7 @@ def _get_models(model_path: str):
     return _MODEL, _TOK2WAV
 @spaces.GPU
-def predict(chatbot, history, prompt_wav, cache_dir, model_path="Step-Audio-2-mini"):
     """
     Run generation on GPU worker. All args must be picklable (strings, lists, dicts).
     Heavy models are created via _get_models() inside this process.
@@ -93,7 +92,7 @@ def predict(chatbot, history, prompt_wav, cache_dir, model_path="Step-Audio-2-mi
         print(f"predict text={text!r}")
         # Convert tokens -> waveform bytes using token2wav
-        audio_bytes = token2wav(audio_tokens, prompt_wav)
         # Persist to temp .wav for the UI
         audio_path = save_tmp_audio(audio_bytes, cache_dir)
@@ -132,7 +131,6 @@ def _launch_demo(args):
             type="messages",
         )
-        # Initialize history with current system prompt value
         history = gr.State([{"role": "system", "content": system_prompt.value}])
         mic = gr.Audio(type="filepath", label="🎤 Speak (optional)")
@@ -148,10 +146,9 @@ def _launch_demo(args):
             if error:
                 gr.Warning(error)
                 return chatbot2, history2, None, None
-            # Run GPU inference with only picklable args
             chatbot2, history2 = predict(
                 chatbot2, history2,
-                args.prompt_wav, args.cache_dir,
                 model_path=args.model_path
             )
             return chatbot2, history2, None, None
@@ -174,7 +171,6 @@ def _launch_demo(args):
         )
         def on_regenerate(chatbot_val, history_val):
-            # Drop last assistant turn(s) to regenerate
             while chatbot_val and chatbot_val[-1]["role"] == "assistant":
                 chatbot_val.pop()
             while history_val and history_val[-1]["role"] == "assistant":
@@ -182,7 +178,7 @@ def _launch_demo(args):
                 history_val.pop()
             return predict(
                 chatbot_val, history_val,
-                args.prompt_wav, args.cache_dir,
                 model_path=args.model_path
             )
@@ -205,7 +201,6 @@ if __name__ == "__main__":
     parser.add_argument("--model-path", type=str, default="Step-Audio-2-mini", help="Model path.")
     parser.add_argument("--server-port", type=int, default=7860, help="Demo server port.")
     parser.add_argument("--server-name", type=str, default="0.0.0.0", help="Demo server name.")
-    parser.add_argument("--prompt-wav", type=str, default="assets/default_female.wav", help="Prompt wave for the assistant.")
     parser.add_argument("--cache-dir", type=str, default="/tmp/stepaudio2", help="Cache directory.")
     args = parser.parse_args()

     """
     global _MODEL, _TOK2WAV
     if _MODEL is None or _TOK2WAV is None:
         from stepaudio2 import StepAudio2
         from token2wav import Token2wav
         _MODEL = StepAudio2(model_path)
     return _MODEL, _TOK2WAV
 @spaces.GPU
+def predict(chatbot, history, cache_dir, model_path="Step-Audio-2-mini"):
     """
     Run generation on GPU worker. All args must be picklable (strings, lists, dicts).
     Heavy models are created via _get_models() inside this process.
         print(f"predict text={text!r}")
         # Convert tokens -> waveform bytes using token2wav
+        audio_bytes = token2wav(audio_tokens)
         # Persist to temp .wav for the UI
         audio_path = save_tmp_audio(audio_bytes, cache_dir)
             type="messages",
         )
         history = gr.State([{"role": "system", "content": system_prompt.value}])
         mic = gr.Audio(type="filepath", label="🎤 Speak (optional)")
             if error:
                 gr.Warning(error)
                 return chatbot2, history2, None, None
             chatbot2, history2 = predict(
                 chatbot2, history2,
+                args.cache_dir,
                 model_path=args.model_path
             )
             return chatbot2, history2, None, None
         )
         def on_regenerate(chatbot_val, history_val):
             while chatbot_val and chatbot_val[-1]["role"] == "assistant":
                 chatbot_val.pop()
             while history_val and history_val[-1]["role"] == "assistant":
                 history_val.pop()
             return predict(
                 chatbot_val, history_val,
+                args.cache_dir,
                 model_path=args.model_path
             )
     parser.add_argument("--model-path", type=str, default="Step-Audio-2-mini", help="Model path.")
     parser.add_argument("--server-port", type=int, default=7860, help="Demo server port.")
     parser.add_argument("--server-name", type=str, default="0.0.0.0", help="Demo server name.")
     parser.add_argument("--cache-dir", type=str, default="/tmp/stepaudio2", help="Cache directory.")
     args = parser.parse_args()