Keith committed on
Commit
e696c96
·
1 Parent(s): ab80cc2

Switch default to audioldm2-music and add model selector

Browse files
Files changed (2) hide show
  1. app.py +37 -21
  2. src/text_to_audio/pipeline.py +4 -0
app.py CHANGED
@@ -17,14 +17,13 @@ from fastapi import BackgroundTasks, FastAPI
17
  from fastapi.responses import FileResponse
18
  from pydantic import BaseModel
19
 
20
- from src.text_to_audio import build_pipeline
21
 
22
- # Initialize Pipeline (defaulting to musicgen-small for MusicSampler)
23
- MODEL_PRESET = os.getenv("MODEL_PRESET", "musicgen-small")
24
  USE_4BIT = os.getenv("USE_4BIT", "False").lower() == "true"
25
 
26
  print(f"Loading {MODEL_PRESET} (4-bit={USE_4BIT})...")
27
- # Force device to cuda if available, otherwise cpu
28
  device = "cuda" if torch.cuda.is_available() else "cpu"
29
  pipe = build_pipeline(preset=MODEL_PRESET, use_4bit=USE_4BIT, device_map=device)
30
 
@@ -33,16 +32,29 @@ class GenRequest(BaseModel):
33
  duration: float = 5.0
34
  model: str = MODEL_PRESET
35
 
36
- # Gradio Interface functions
37
- def gradio_gen(prompt, duration):
38
  if not prompt or not prompt.strip():
39
  return None, "Please enter a prompt."
40
 
41
- # MusicGen: 5 seconds ~ 250 tokens (50 tokens/sec approx)
42
- tokens = int(duration * 50)
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  out, profile = pipe.generate_with_profile(
44
  prompt,
45
- generate_kwargs={"max_new_tokens": tokens}
46
  )
47
  single = out if isinstance(out, dict) else out[0]
48
  audio = single["audio"]
@@ -54,7 +66,6 @@ def gradio_gen(prompt, duration):
54
  arr = np.asarray(audio)
55
 
56
  path = f"/tmp/gradio_{uuid.uuid4()}.wav"
57
- # Ensure audio is properly formatted for soundfile
58
  sf.write(path, arr.T if arr.ndim == 2 else arr, sr)
59
  return path, f"Generated in {profile.get('time_s', 0):.2f}s (RTF: {profile.get('rtf', 0):.2f})"
60
 
@@ -64,17 +75,21 @@ with gr.Blocks(title="MusicSampler", theme=gr.themes.Monochrome()) as ui:
64
 
65
  with gr.Row():
66
  with gr.Column():
67
- prompt = gr.Textbox(label="Musical Prompt", placeholder="Lo-fi hip hop beat with smooth rhodes piano...", lines=3)
68
- duration = gr.Slider(minimum=1, maximum=30, value=5, step=1, label="Duration (seconds)")
 
 
 
 
 
 
69
  btn = gr.Button("Sample", variant="primary")
70
  with gr.Column():
71
  audio_out = gr.Audio(label="Output Sample", type="filepath")
72
  stats = gr.Label(label="Performance")
73
 
74
- btn.click(gradio_gen, inputs=[prompt, duration], outputs=[audio_out, stats])
75
 
76
- # HF Spaces automatically launches the app defined in app_file if it's sdk: gradio
77
- # To expose a custom API alongside Gradio, we use the internal FastAPI app.
78
  app = ui.app
79
 
80
  @app.post("/generate")
@@ -83,10 +98,15 @@ async def api_generate(req: GenRequest, background_tasks: BackgroundTasks):
83
  filename = f"gen_{uuid.uuid4()}.wav"
84
  output_path = os.path.join("/tmp", filename)
85
 
86
- tokens = int(req.duration * 50)
 
 
 
 
 
87
  out = pipe.generate(
88
  req.prompt,
89
- generate_kwargs={"max_new_tokens": tokens}
90
  )
91
 
92
  single = out if isinstance(out, dict) else out[0]
@@ -99,12 +119,8 @@ async def api_generate(req: GenRequest, background_tasks: BackgroundTasks):
99
  arr = np.asarray(audio)
100
 
101
  sf.write(output_path, arr.T if arr.ndim == 2 else arr, sr)
102
-
103
- # Clean up file after serving
104
  background_tasks.add_task(os.remove, output_path)
105
-
106
  return FileResponse(output_path, media_type="audio/wav", filename=filename)
107
 
108
- # Standard entry point for HF Spaces
109
  if __name__ == "__main__":
110
  ui.launch(server_name="0.0.0.0", server_port=7860)
 
17
  from fastapi.responses import FileResponse
18
  from pydantic import BaseModel
19
 
20
+ from src.text_to_audio import build_pipeline, list_presets
21
 
22
+ # Defaults to audioldm2-music as a robust alternative to MusicGen
23
+ MODEL_PRESET = os.getenv("MODEL_PRESET", "audioldm2-music")
24
  USE_4BIT = os.getenv("USE_4BIT", "False").lower() == "true"
25
 
26
  print(f"Loading {MODEL_PRESET} (4-bit={USE_4BIT})...")
 
27
  device = "cuda" if torch.cuda.is_available() else "cpu"
28
  pipe = build_pipeline(preset=MODEL_PRESET, use_4bit=USE_4BIT, device_map=device)
29
 
 
32
  duration: float = 5.0
33
  model: str = MODEL_PRESET
34
 
35
+ def gradio_gen(prompt, duration, selected_model):
36
+ global pipe, MODEL_PRESET
37
  if not prompt or not prompt.strip():
38
  return None, "Please enter a prompt."
39
 
40
+ # Reload model if preset changed
41
+ if selected_model != MODEL_PRESET:
42
+ print(f"Switching to {selected_model}...")
43
+ pipe = build_pipeline(preset=selected_model, use_4bit=USE_4BIT, device_map=device)
44
+ MODEL_PRESET = selected_model
45
+
46
+ # Tokens/Steps vary by model;
47
+ # For MusicGen: ~50 tokens/sec
48
+ # For AudioLDM: uses num_inference_steps (passed via generate_kwargs)
49
+ generate_kwargs = {}
50
+ if "musicgen" in MODEL_PRESET:
51
+ generate_kwargs["max_new_tokens"] = int(duration * 50)
52
+ elif "audioldm" in MODEL_PRESET:
53
+ generate_kwargs["num_inference_steps"] = 25 # Default good quality
54
+
55
  out, profile = pipe.generate_with_profile(
56
  prompt,
57
+ generate_kwargs=generate_kwargs
58
  )
59
  single = out if isinstance(out, dict) else out[0]
60
  audio = single["audio"]
 
66
  arr = np.asarray(audio)
67
 
68
  path = f"/tmp/gradio_{uuid.uuid4()}.wav"
 
69
  sf.write(path, arr.T if arr.ndim == 2 else arr, sr)
70
  return path, f"Generated in {profile.get('time_s', 0):.2f}s (RTF: {profile.get('rtf', 0):.2f})"
71
 
 
75
 
76
  with gr.Row():
77
  with gr.Column():
78
+ prompt = gr.Textbox(label="Musical/Audio Prompt", placeholder="An ambient synth pad with a slow filter sweep...", lines=3)
79
+ with gr.Row():
80
+ duration = gr.Slider(minimum=1, maximum=30, value=5, step=1, label="Duration (seconds)")
81
+ preset_choice = gr.Dropdown(
82
+ choices=list(list_presets().keys()),
83
+ value=MODEL_PRESET,
84
+ label="Model Preset"
85
+ )
86
  btn = gr.Button("Sample", variant="primary")
87
  with gr.Column():
88
  audio_out = gr.Audio(label="Output Sample", type="filepath")
89
  stats = gr.Label(label="Performance")
90
 
91
+ btn.click(gradio_gen, inputs=[prompt, duration, preset_choice], outputs=[audio_out, stats])
92
 
 
 
93
  app = ui.app
94
 
95
  @app.post("/generate")
 
98
  filename = f"gen_{uuid.uuid4()}.wav"
99
  output_path = os.path.join("/tmp", filename)
100
 
101
+ generate_kwargs = {}
102
+ if "musicgen" in req.model:
103
+ generate_kwargs["max_new_tokens"] = int(req.duration * 50)
104
+ elif "audioldm" in req.model:
105
+ generate_kwargs["num_inference_steps"] = 25
106
+
107
  out = pipe.generate(
108
  req.prompt,
109
+ generate_kwargs=generate_kwargs
110
  )
111
 
112
  single = out if isinstance(out, dict) else out[0]
 
119
  arr = np.asarray(audio)
120
 
121
  sf.write(output_path, arr.T if arr.ndim == 2 else arr, sr)
 
 
122
  background_tasks.add_task(os.remove, output_path)
 
123
  return FileResponse(output_path, media_type="audio/wav", filename=filename)
124
 
 
125
  if __name__ == "__main__":
126
  ui.launch(server_name="0.0.0.0", server_port=7860)
src/text_to_audio/pipeline.py CHANGED
@@ -41,6 +41,10 @@ PRESETS = {
41
  "model_id": "facebook/musicgen-small",
42
  "description": "Music/sfx; 32k Hz, generation-style.",
43
  },
 
 
 
 
44
  }
45
 
46
 
 
41
  "model_id": "facebook/musicgen-small",
42
  "description": "Music/sfx; 32k Hz, generation-style.",
43
  },
44
+ "audioldm2-music": {
45
+ "model_id": "cvssp/audioldm2-music",
46
+ "description": "High-quality music generation via AudioLDM2; robust alternative to MusicGen.",
47
+ },
48
  }
49
 
50