Spaces:

hugggof
/

saos

Running

App Files Files Community

hugofloresgarcia committed on 25 days ago

Commit

1998a68

1 Parent(s): a3ee852

Initial commit: Add Stable Audio Open Small app with 4 variations

Browse files

Files changed (3) hide show

README.md +69 -6
app.py +148 -0
requirements.txt +33 -0

README.md CHANGED Viewed

@@ -1,12 +1,75 @@
 ---
-title: Saos
-emoji: 📊
-colorFrom: red
-colorTo: pink
 sdk: gradio
-sdk_version: 5.49.1
 app_file: app.py
 pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: Stable Audio Open Small - 4 Variations
+emoji: 🎵
+colorFrom: blue
+colorTo: purple
 sdk: gradio
+sdk_version: 5.20.0
 app_file: app.py
 pinned: false
+license: stability-ai-community
 ---
+# Stable Audio Open Small - 4 Variations
+Generate up to 4 audio variations from a single text prompt using Stability AI's Stable Audio Open Small model.
+## Model Information
+**Model**: [stabilityai/stable-audio-open-small](https://huggingface.co/stabilityai/stable-audio-open-small)
+- **Type**: Latent diffusion model (DiT) with autoencoder
+- **Sample Rate**: 44.1 kHz
+- **Format**: Stereo audio
+- **Max Duration**: 11 seconds
+- **License**: Stability AI Community License
+## Features
+- **4 Variations**: Generate 4 different audio variations from a single prompt
+- **Text-to-Audio**: Simple text prompt interface
+- **Variable Duration**: Control audio length (1-11 seconds)
+- **Fast Generation**: Uses optimized pingpong sampler with 8 steps
+## Usage
+1. Enter a text prompt describing the audio you want to generate
+2. Adjust the duration slider (1-11 seconds)
+3. Click "Generate" to create 4 variations
+4. Listen to and download your favorite variations
+## Example Prompts
+- "128 BPM tech house drum loop"
+- "Ocean waves crashing on beach"
+- "Jazz piano melody"
+- "Rainforest ambience with bird calls"
+- "Electronic synth pad"
+## Model Limitations
+- The model is not able to generate realistic vocals
+- Trained with English descriptions - may not perform as well in other languages
+- Better at generating sound effects and field recordings than music
+- Performance varies across different music styles and cultures
+- Prompt engineering may be required for best results
+## Technical Details
+- **Steps**: 8 (optimized for speed)
+- **CFG Scale**: 1.0
+- **Sampler**: pingpong
+- **Batch Size**: 4 (for generating variations)
+## License
+This Space uses the Stability AI Community License. For commercial use, please refer to [stability.ai/license](https://stability.ai/license).
+## Model Card
+For more information about the model, training data, and limitations, see the [model card](https://huggingface.co/stabilityai/stable-audio-open-small).
+## Research Paper
+[Fast Text-to-Audio Generation with Adversarial Post-Training](https://arxiv.org/abs/2505.08175)

app.py ADDED Viewed

	@@ -0,0 +1,148 @@

+import torch
+import torchaudio
+import gradio as gr
+from stable_audio_tools import get_pretrained_model
+from stable_audio_tools.inference.generation import generate_diffusion_cond
+# Module-level model state: assigned once by load_model() and read by generate_audio()
+model = None  # pretrained model object; stays None until load_model() has run
+model_config = None  # config returned with the model; read for "sample_rate" and "sample_size"
+device = None  # "cuda" or "cpu", chosen in load_model()
+def load_model():
+    """Load the pretrained model on startup"""
+    global model, model_config, device
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    print(f"Loading model on device: {device}")
+    # Download and load the pretrained model
+    model, model_config = get_pretrained_model("stabilityai/stable-audio-open-small")
+    sample_rate = model_config["sample_rate"]
+    sample_size = model_config["sample_size"]
+    model = model.to(device).eval().requires_grad_(False)
+    model = model.to(torch.float16)  # Use half precision for efficiency
+    print(f"Model loaded successfully. Sample rate: {sample_rate}, Sample size: {sample_size}")
+    return model, model_config
+def generate_audio(prompt, seconds_total=11):
+    """Generate 4 audio variations from a text prompt"""
+    global model, model_config, device
+    if model is None:
+        return [], "Model not loaded. Please wait..."
+    if not prompt or not prompt.strip():
+        return [], "Please enter a text prompt."
+    # Set up text and timing conditioning (repeat for batch_size)
+    conditioning = [{
+        "prompt": prompt,
+        "seconds_total": seconds_total
+    }] * 4  # Repeat for batch_size=4
+    # Generate 4 variations using batch_size=4
+    try:
+        output = generate_diffusion_cond(
+            model,
+            steps=8,
+            cfg_scale=1.0,
+            conditioning=conditioning,
+            sample_size=model_config["sample_size"],
+            sampler_type="pingpong",
+            device=device,
+            batch_size=4  # Generate 4 variations
+        )
+        # Rearrange audio batch: [batch, channels, samples] -> [channels, batch*samples]
+        # Then split back into individual files
+        sample_rate = model_config["sample_rate"]
+        audio_files = []
+        # Process each variation in the batch
+        for i in range(4):
+            # Extract single variation: [channels, samples]
+            audio = output[i]  # Shape: [channels, samples]
+            # Peak normalize, clip, convert to int16
+            audio = audio.to(torch.float32)
+            audio_max = torch.max(torch.abs(audio))
+            if audio_max > 0:
+                audio = audio.div(audio_max)
+            audio = audio.clamp(-1, 1).mul(32767).to(torch.int16).cpu()
+            # Save to temporary file
+            filename = f"output_variation_{i+1}.wav"
+            torchaudio.save(filename, audio, sample_rate)
+            audio_files.append(filename)
+        return audio_files, f"Generated 4 variations for: '{prompt}'"
+    except Exception as e:
+        import traceback
+        error_msg = f"Error generating audio: {str(e)}\n{traceback.format_exc()}"
+        print(error_msg)
+        return [], error_msg
+# Load model on startup
+print("Initializing model...")
+load_model()
+# Create Gradio interface
+with gr.Blocks(title="Stable Audio Open Small - 4 Variations") as demo:
+    gr.Markdown("""
+    # Stable Audio Open Small
+    Generate up to 4 audio variations from a text prompt.
+    **Model**: [stabilityai/stable-audio-open-small](https://huggingface.co/stabilityai/stable-audio-open-small)
+    Enter a text description and click Generate to create 4 different audio variations.
+    """)
+    with gr.Row():
+        with gr.Column():
+            prompt_input = gr.Textbox(
+                label="Text Prompt",
+                placeholder="e.g., 128 BPM tech house drum loop",
+                lines=2
+            )
+            seconds_input = gr.Slider(
+                minimum=1,
+                maximum=11,
+                value=11,
+                step=1,
+                label="Duration (seconds)",
+                info="Maximum 11 seconds"
+            )
+            generate_btn = gr.Button("Generate", variant="primary")
+        with gr.Column():
+            status_output = gr.Textbox(label="Status", interactive=False)
+            audio_gallery = gr.Gallery(
+                label="Generated Audio Variations",
+                show_label=True,
+                elem_id="gallery",
+                columns=2,
+                rows=2,
+                height="auto"
+            )
+    generate_btn.click(
+        fn=generate_audio,
+        inputs=[prompt_input, seconds_input],
+        outputs=[audio_gallery, status_output]
+    )
+    gr.Markdown("""
+    ### Tips
+    - The model works best with English descriptions
+    - Better at generating sound effects and field recordings than music
+    - Each variation uses a different random seed for diversity
+    """)
+if __name__ == "__main__":
+    demo.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,33 @@

+# Core dependencies for Stable Audio Open Small
+torch>=2.5.1
+torchaudio>=2.5.1
+gradio>=5.20.0
+einops
+einops-exts
+safetensors
+transformers
+huggingface_hub
+sentencepiece==0.1.99
+# Stable Audio Tools dependencies
+alias-free-torch==0.0.6
+auraloss==0.4.0
+descript-audio-codec==1.0.0
+ema-pytorch==0.2.3
+encodec==0.1.1
+importlib-resources==5.12.0
+k-diffusion==0.1.1
+laion-clap==1.1.4
+local-attention==1.8.6
+pandas==2.0.2
+prefigure==0.0.9
+pytorch_lightning==2.1.0
+PyWavelets==1.4.1
+torchmetrics==0.11.4
+tqdm
+v-diffusion-pytorch==0.0.2
+vector-quantize-pytorch==1.14.41
+# Install stable-audio-tools from source
+git+https://github.com/Stability-AI/stable-audio-tools.git