manoskary committed on
Commit
3cb2cee
·
verified ·
1 Parent(s): fb5663b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +170 -49
app.py CHANGED
@@ -1,3 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import torch
2
  import torchaudio
3
  from einops import rearrange
@@ -7,11 +21,40 @@ import os
7
  import uuid
8
 
9
  # Importing the model-related functions
10
- from stable_audio_tools import get_pretrained_model
11
  from stable_audio_tools.inference.generation import generate_diffusion_cond
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
  # Load the model outside of the GPU-decorated function
 
 
14
  def load_model():
 
 
 
 
15
  print("Loading model...")
16
  model, model_config = get_pretrained_model("santifiorino/SAO-Instrumental-Finetune")
17
  print("Model loaded successfully.")
@@ -20,6 +63,19 @@ def load_model():
20
  # Function to set up, generate, and process the audio
21
  @spaces.GPU(duration=120) # Allocate GPU only when this function is called
22
  def generate_audio(prompt, seconds_total=30, steps=100, cfg_scale=7):
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  print(f"Prompt received: {prompt}")
24
  print(f"Settings: Duration={seconds_total}s, Steps={steps}, CFG Scale={cfg_scale}")
25
 
@@ -28,7 +84,7 @@ def generate_audio(prompt, seconds_total=30, steps=100, cfg_scale=7):
28
 
29
  # Fetch the Hugging Face token from the environment variable
30
  hf_token = os.getenv('HF_TOKEN')
31
- print(f"Hugging Face token: {hf_token}")
32
 
33
  # Use pre-loaded model and configuration
34
  model, model_config = load_model()
@@ -82,65 +138,130 @@ def generate_audio(prompt, seconds_total=30, steps=100, cfg_scale=7):
82
  # Return the path to the generated audio file
83
  return unique_filename
84
 
 
85
  # Setting up the Gradio Interface
86
  interface = gr.Interface(
87
  fn=generate_audio,
88
  inputs=[
89
- gr.Textbox(label="Prompt", placeholder="Enter your text prompt here"),
 
 
 
 
90
  gr.Slider(0, 47, value=30, label="Duration in Seconds"),
91
  gr.Slider(10, 150, value=100, step=10, label="Number of Diffusion Steps"),
92
  gr.Slider(1, 15, value=7, step=0.1, label="CFG Scale")
93
  ],
94
- outputs=gr.Audio(type="filepath", label="Generated Audio"),
95
- title="Stable Audio Generator",
96
- description="Generate variable-length stereo audio at 44.1kHz from text prompts using Stable Audio Open 1.0.",
 
 
 
 
 
 
 
 
 
 
 
 
 
97
  examples=[
98
- [
99
- "Create a serene soundscape of a quiet beach at sunset.", # Text prompt
100
-
101
- 45, # Duration in Seconds
102
- 100, # Number of Diffusion Steps
103
- 10, # CFG Scale
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
  ],
105
- [
106
- "Generate an energetic and bustling city street scene with distant traffic and close conversations.", # Text prompt
107
-
108
- 30, # Duration in Seconds
109
- 120, # Number of Diffusion Steps
110
- 5, # CFG Scale
111
- ],
112
- [
113
- "Simulate a forest ambiance with birds chirping and wind rustling through the leaves.", # Text prompt
114
- 60, # Duration in Seconds
115
- 140, # Number of Diffusion Steps
116
- 7.5, # CFG Scale
117
- ],
118
- [
119
- "Recreate a gentle rainfall with distant thunder.", # Text prompt
120
-
121
- 35, # Duration in Seconds
122
- 110, # Number of Diffusion Steps
123
- 8, # CFG Scale
124
-
125
- ],
126
- [
127
- "Imagine a jazz cafe environment with soft music and ambient chatter.", # Text prompt
128
- 25, # Duration in Seconds
129
- 90, # Number of Diffusion Steps
130
- 6, # CFG Scale
131
-
132
- ],
133
- ["Rock beat played in a treated studio, session drumming on an acoustic kit.",
134
- 30, # Duration in Seconds
135
- 100, # Number of Diffusion Steps
136
- 7, # CFG Scale
137
-
138
- ]
139
- ])
140
-
141
 
142
  # Pre-load the model to avoid multiprocessing issues
143
  model, model_config = load_model()
144
 
145
  # Launch the Interface
146
- interface.launch()
 
 
1
+ """
2
+ Stable Audio Open Gradio Inference App for HuggingFace Spaces
3
+
4
+ This app provides a simple interface for generating high-quality instrumental music
5
+ using Stable Audio Open with the SAO-Instrumental-Finetune model.
6
+
7
+ Designed to be used as a remote computation tool for WeaveMuse.
8
+
9
+ Architecture:
10
+ - Stable Audio model is loaded OUTSIDE the GPU-decorated function
11
+ - Only the inference itself runs on GPU (cost-efficient for HF Spaces Zero GPU)
12
+ - Model initialization happens once at startup
13
+ """
14
+
15
  import torch
16
  import torchaudio
17
  from einops import rearrange
 
21
  import uuid
22
 
23
  # Importing the model-related functions
 
24
  from stable_audio_tools.inference.generation import generate_diffusion_cond
25
+ import json
26
+ from stable_audio_tools.models.factory import create_model_from_config
27
+ from stable_audio_tools.models.utils import load_ckpt_state_dict
28
+
29
+ from huggingface_hub import hf_hub_download
30
+
31
+
32
def get_pretrained_model(name="santifiorino/SAO-Instrumental-Finetune"):
    """
    Download and build the Stable Audio model from the Hugging Face Hub.

    Args:
        name: Hub repository id holding ``model_config.json`` and the
            model weights.

    Returns:
        Tuple ``(model, model_config)`` where ``model`` has the checkpoint
        weights loaded and ``model_config`` is the parsed JSON config dict.
    """
    # Fetch and parse the model configuration.
    model_config_path = hf_hub_download(name, filename="model_config.json", repo_type='model')

    with open(model_config_path, encoding="utf-8") as f:
        model_config = json.load(f)

    model = create_model_from_config(model_config)

    # Prefer the safetensors weights; if the repo does not provide
    # model.safetensors, fall back to the finetune's .ckpt file.
    # (NOTE: the fallback file really is SAO_Instrumental_Finetune.ckpt,
    # not a generic model.ckpt.)
    try:
        model_ckpt_path = hf_hub_download(name, filename="model.safetensors", repo_type='model')
    except Exception:
        model_ckpt_path = hf_hub_download(name, filename="SAO_Instrumental_Finetune.ckpt", repo_type='model')
    model.load_state_dict(load_ckpt_state_dict(model_ckpt_path))

    return model, model_config
49
 
50
  # Load the model outside of the GPU-decorated function
51
+
52
+
53
  def load_model():
54
+ """
55
+ Load the Stable Audio model outside GPU function.
56
+ This is called once at startup to download and cache the model.
57
+ """
58
  print("Loading model...")
59
  model, model_config = get_pretrained_model("santifiorino/SAO-Instrumental-Finetune")
60
  print("Model loaded successfully.")
 
63
  # Function to set up, generate, and process the audio
64
  @spaces.GPU(duration=120) # Allocate GPU only when this function is called
65
  def generate_audio(prompt, seconds_total=30, steps=100, cfg_scale=7):
66
+ """
67
+ Generate instrumental music using Stable Audio.
68
+ This function runs on GPU via @spaces.GPU decorator.
69
+
70
+ Args:
71
+ prompt: Text description of the music to generate
72
+ seconds_total: Duration in seconds (max 47)
73
+ steps: Number of diffusion steps (10-150)
74
+ cfg_scale: Classifier-free guidance scale (1.0-15.0)
75
+
76
+ Returns:
77
+ Path to generated audio file
78
+ """
79
  print(f"Prompt received: {prompt}")
80
  print(f"Settings: Duration={seconds_total}s, Steps={steps}, CFG Scale={cfg_scale}")
81
 
 
84
 
85
  # Fetch the Hugging Face token from the environment variable
86
  hf_token = os.getenv('HF_TOKEN')
87
+ print(f"Hugging Face token: {'set' if hf_token else 'not set'}")
88
 
89
  # Use pre-loaded model and configuration
90
  model, model_config = load_model()
 
138
  # Return the path to the generated audio file
139
  return unique_filename
140
 
141
+
142
# Setting up the Gradio Interface.
# NOTE(review): the mojibake replacement character in the "Other" bullet of
# the article markdown has been removed (it rendered as "�" in the UI).
interface = gr.Interface(
    fn=generate_audio,
    inputs=[
        gr.Textbox(
            label="Prompt",
            placeholder="Describe the instrumental music you want to generate...",
            value="Upbeat rock guitar with drums and bass"
        ),
        # Duration capped at 47 s — the model's maximum window (see article text).
        gr.Slider(0, 47, value=30, label="Duration in Seconds"),
        gr.Slider(10, 150, value=100, step=10, label="Number of Diffusion Steps"),
        gr.Slider(1, 15, value=7, step=0.1, label="CFG Scale")
    ],
    outputs=gr.Audio(type="filepath", label="Generated Music"),
    title="🎸 Stable Audio Instrumental Generator",
    description="""
    Generate high-quality instrumental music at 44.1kHz from text prompts using the SAO-Instrumental-Finetune model.

    **Features:**
    - 🎹 Piano, guitar, drums, bass, and orchestral instruments
    - 🎵 Various musical genres and styles
    - ⚡ High-quality stereo audio
    - 🎼 Perfect for music composition and production

    **Tips:**
    - Be specific about instruments, tempo, and mood
    - Higher steps = better quality (recommended: 100-120)
    - CFG Scale 7-10 works well for most prompts
    """,
    # Each example is [prompt, seconds_total, steps, cfg_scale], matching
    # the generate_audio signature.
    examples=[
        [
            "Energetic rock guitar riff with powerful drums and bass",
            30,
            100,
            7,
        ],
        [
            "Smooth jazz piano trio with upright bass and brushed drums",
            35,
            110,
            8,
        ],
        [
            "Epic orchestral strings and brass with cinematic percussion",
            45,
            120,
            10,
        ],
        [
            "Funky electric bass groove with rhythm guitar and tight drums",
            30,
            100,
            7,
        ],
        [
            "Acoustic guitar fingerpicking with soft percussion",
            40,
            110,
            6,
        ],
        [
            "Electronic synthesizer pads with ambient textures and subtle beats",
            35,
            100,
            7.5,
        ],
        [
            "Classical piano solo with expressive dynamics and sustain pedal",
            30,
            110,
            8,
        ],
        [
            "Blues guitar solo with bending notes over a shuffle rhythm section",
            30,
            100,
            7,
        ],
        [
            "Latin percussion ensemble with congas, bongos, and timbales",
            30,
            100,
            7,
        ],
        [
            "Rock beat played in a treated studio, session drumming on an acoustic kit",
            30,
            100,
            7,
        ]
    ],
    article="""
    ---
    ### About SAO-Instrumental-Finetune

    This model is a fine-tuned version of **Stable Audio Open 1.0** specifically trained for instrumental music generation.

    **Capabilities:**
    - 🎸 **Guitar**: Acoustic, electric, classical, jazz, rock
    - 🥁 **Drums**: Rock, jazz, electronic, orchestral percussion
    - 🎹 **Piano**: Classical, jazz, modern, ambient
    - **Orchestral**: Strings, brass, woodwinds
    - **Other**: Bass, synthesizers, ethnic instruments

    **Technical Details:**
    - Model: SAO-Instrumental-Finetune (based on Stable Audio Open 1.0)
    - Sample Rate: 44.1kHz (CD quality)
    - Max Duration: 47 seconds
    - Architecture: Latent diffusion model with conditioning

    **Integration:**
    This space is designed to work with **WeaveMuse** for AI-assisted music composition.
    Use the API endpoint for programmatic access in your music production workflows.

    ---

    *Powered by [Stability AI](https://stability.ai/) and [WeaveMuse](https://github.com/manoskary/weavemuse)*
    """
)
 
 
 
 
 
 
 
 
261
 
262
# Pre-load the model to avoid multiprocessing issues — this runs once at
# import time, outside the GPU-decorated function, so the weights are
# downloaded/cached before any request arrives.
model, model_config = load_model()

# Launch the Interface only when run as a script (not when imported).
if __name__ == "__main__":
    interface.launch()