hugofloresgarcia committed on
Commit
1998a68
·
1 Parent(s): a3ee852

Initial commit: Add Stable Audio Open Small app with 4 variations

Browse files
Files changed (3) hide show
  1. README.md +69 -6
  2. app.py +148 -0
  3. requirements.txt +33 -0
README.md CHANGED
@@ -1,12 +1,75 @@
1
  ---
2
- title: Saos
3
- emoji: 📊
4
- colorFrom: red
5
- colorTo: pink
6
  sdk: gradio
7
- sdk_version: 5.49.1
8
  app_file: app.py
9
  pinned: false
 
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Stable Audio Open Small - 4 Variations
3
+ emoji: 🎵
4
+ colorFrom: blue
5
+ colorTo: purple
6
  sdk: gradio
7
+ sdk_version: 5.20.0
8
  app_file: app.py
9
  pinned: false
10
+ license: stability-ai-community
11
  ---
12
 
13
+ # Stable Audio Open Small - 4 Variations
14
+
15
+ Generate up to 4 audio variations from a single text prompt using Stability AI's Stable Audio Open Small model.
16
+
17
+ ## Model Information
18
+
19
+ **Model**: [stabilityai/stable-audio-open-small](https://huggingface.co/stabilityai/stable-audio-open-small)
20
+
21
+ - **Type**: Latent diffusion model (DiT) with autoencoder
22
+ - **Sample Rate**: 44.1 kHz
23
+ - **Format**: Stereo audio
24
+ - **Max Duration**: 11 seconds
25
+ - **License**: Stability AI Community License
26
+
27
+ ## Features
28
+
29
+ - **4 Variations**: Generate 4 different audio variations from a single prompt
30
+ - **Text-to-Audio**: Simple text prompt interface
31
+ - **Variable Duration**: Control audio length (1-11 seconds)
32
+ - **Fast Generation**: Uses optimized pingpong sampler with 8 steps
33
+
34
+ ## Usage
35
+
36
+ 1. Enter a text prompt describing the audio you want to generate
37
+ 2. Adjust the duration slider (1-11 seconds)
38
+ 3. Click "Generate" to create 4 variations
39
+ 4. Listen to and download your favorite variations
40
+
41
+ ## Example Prompts
42
+
43
+ - "128 BPM tech house drum loop"
44
+ - "Ocean waves crashing on beach"
45
+ - "Jazz piano melody"
46
+ - "Rainforest ambience with bird calls"
47
+ - "Electronic synth pad"
48
+
49
+ ## Model Limitations
50
+
51
+ - The model is not able to generate realistic vocals
52
+ - Trained with English descriptions - may not perform as well in other languages
53
+ - Better at generating sound effects and field recordings than music
54
+ - Performance varies across different music styles and cultures
55
+ - Prompt engineering may be required for best results
56
+
57
+ ## Technical Details
58
+
59
+ - **Steps**: 8 (optimized for speed)
60
+ - **CFG Scale**: 1.0
61
+ - **Sampler**: pingpong
62
+ - **Batch Size**: 4 (for generating variations)
63
+
64
+ ## License
65
+
66
+ This Space uses the Stability AI Community License. For commercial use, please refer to [stability.ai/license](https://stability.ai/license).
67
+
68
+ ## Model Card
69
+
70
+ For more information about the model, training data, and limitations, see the [model card](https://huggingface.co/stabilityai/stable-audio-open-small).
71
+
72
+ ## Research Paper
73
+
74
+ [Stable Audio Open: An Open Generative Audio Model](https://arxiv.org/abs/2505.08175)
75
+
app.py ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torchaudio
3
+ import gradio as gr
4
+ from stable_audio_tools import get_pretrained_model
5
+ from stable_audio_tools.inference.generation import generate_diffusion_cond
6
+
7
# Global model state, populated once by load_model() at startup (None until then).
model = None         # the loaded diffusion model
model_config = None  # config dict; code below reads its "sample_rate" and "sample_size"
device = None        # "cuda" or "cpu", chosen in load_model()
11
+
12
def load_model():
    """Download Stable Audio Open Small and cache it in module globals.

    Sets the module-level ``model``, ``model_config`` and ``device`` used by
    ``generate_audio``, and also returns ``(model, model_config)``.
    """
    global model, model_config, device

    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Loading model on device: {device}")

    # Download and load the pretrained model
    model, model_config = get_pretrained_model("stabilityai/stable-audio-open-small")
    sample_rate = model_config["sample_rate"]
    sample_size = model_config["sample_size"]

    model = model.to(device).eval().requires_grad_(False)

    # fp16 halves memory and speeds up inference on GPU, but on CPU it is very
    # slow and some ops are unimplemented — only downcast when running on CUDA.
    if device == "cuda":
        model = model.to(torch.float16)

    print(f"Model loaded successfully. Sample rate: {sample_rate}, Sample size: {sample_size}")
    return model, model_config
29
+
30
def generate_audio(prompt, seconds_total=11):
    """Generate 4 audio variations from a text prompt.

    Args:
        prompt: Text description of the desired audio.
        seconds_total: Target duration in seconds (slider range 1-11).

    Returns:
        Tuple ``(audio_files, status)`` where ``audio_files`` is a list of
        paths to generated .wav files (empty on failure) and ``status`` is a
        human-readable message.
    """
    import os
    import tempfile

    global model, model_config, device

    if model is None:
        return [], "Model not loaded. Please wait..."

    if not prompt or not prompt.strip():
        return [], "Please enter a text prompt."

    # Text + timing conditioning, repeated once per batch item (batch_size=4).
    conditioning = [{
        "prompt": prompt,
        "seconds_total": seconds_total
    }] * 4

    try:
        output = generate_diffusion_cond(
            model,
            steps=8,                 # low step count paired with the pingpong sampler
            cfg_scale=1.0,
            conditioning=conditioning,
            sample_size=model_config["sample_size"],
            sampler_type="pingpong",
            device=device,
            batch_size=4             # 4 variations in a single batch
        )

        sample_rate = model_config["sample_rate"]
        audio_files = []

        # Write into a unique per-request directory: the previous fixed
        # filenames in the CWD were clobbered by concurrent requests in a
        # shared Space, serving one user's audio to another.
        out_dir = tempfile.mkdtemp(prefix="saos_")

        # Process each variation in the batch
        for i in range(4):
            audio = output[i]  # Shape: [channels, samples]

            # Peak normalize, clip to [-1, 1], convert to int16 PCM on CPU.
            audio = audio.to(torch.float32)
            audio_max = torch.max(torch.abs(audio))
            if audio_max > 0:
                audio = audio.div(audio_max)
            audio = audio.clamp(-1, 1).mul(32767).to(torch.int16).cpu()

            filename = os.path.join(out_dir, f"output_variation_{i+1}.wav")
            torchaudio.save(filename, audio, sample_rate)
            audio_files.append(filename)

        return audio_files, f"Generated 4 variations for: '{prompt}'"

    except Exception as e:
        import traceback
        error_msg = f"Error generating audio: {str(e)}\n{traceback.format_exc()}"
        print(error_msg)
        return [], error_msg
88
+
89
# Load model on startup so the first request doesn't pay the download cost.
print("Initializing model...")
load_model()

# Create Gradio interface
with gr.Blocks(title="Stable Audio Open Small - 4 Variations") as demo:
    gr.Markdown("""
    # Stable Audio Open Small

    Generate up to 4 audio variations from a text prompt.

    **Model**: [stabilityai/stable-audio-open-small](https://huggingface.co/stabilityai/stable-audio-open-small)

    Enter a text description and click Generate to create 4 different audio variations.
    """)

    with gr.Row():
        with gr.Column():
            prompt_input = gr.Textbox(
                label="Text Prompt",
                placeholder="e.g., 128 BPM tech house drum loop",
                lines=2
            )
            seconds_input = gr.Slider(
                minimum=1,
                maximum=11,
                value=11,
                step=1,
                label="Duration (seconds)",
                info="Maximum 11 seconds"
            )
            generate_btn = gr.Button("Generate", variant="primary")

        with gr.Column():
            status_output = gr.Textbox(label="Status", interactive=False)
            # gr.Gallery only renders images/videos, so the list of .wav paths
            # returned by generate_audio could not be displayed or played.
            # gr.File with file_count="multiple" accepts a list of file paths
            # and lets the user preview/download each variation.
            audio_gallery = gr.File(
                label="Generated Audio Variations",
                file_count="multiple",
                interactive=False,
                elem_id="gallery"
            )

    generate_btn.click(
        fn=generate_audio,
        inputs=[prompt_input, seconds_input],
        outputs=[audio_gallery, status_output]
    )

    gr.Markdown("""
    ### Tips
    - The model works best with English descriptions
    - Better at generating sound effects and field recordings than music
    - Each variation uses a different random seed for diversity
    """)

if __name__ == "__main__":
    demo.launch()
148
+
requirements.txt ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core dependencies for Stable Audio Open Small
2
+ torch>=2.5.1
3
+ torchaudio>=2.5.1
4
+ gradio>=5.20.0
5
+ einops
6
+ einops-exts
7
+ safetensors
8
+ transformers
9
+ huggingface_hub
10
+ sentencepiece==0.1.99
11
+
12
+ # Stable Audio Tools dependencies
13
+ alias-free-torch==0.0.6
14
+ auraloss==0.4.0
15
+ descript-audio-codec==1.0.0
16
+ ema-pytorch==0.2.3
17
+ encodec==0.1.1
18
+ importlib-resources==5.12.0
19
+ k-diffusion==0.1.1
20
+ laion-clap==1.1.4
21
+ local-attention==1.8.6
22
+ pandas==2.0.2
23
+ prefigure==0.0.9
24
+ pytorch_lightning==2.1.0
25
+ PyWavelets==1.4.1
26
+ torchmetrics==0.11.4
27
+ tqdm
28
+ v-diffusion-pytorch==0.0.2
29
+ vector-quantize-pytorch==1.14.41
30
+
31
+ # Install stable-audio-tools from source
32
+ git+https://github.com/Stability-AI/stable-audio-tools.git
33
+