manoskary committed on
Commit
3cb2cee
·
verified ·
1 Parent(s): fb5663b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +170 -49
app.py CHANGED
@@ -1,3 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import torch
2
  import torchaudio
3
  from einops import rearrange
@@ -7,11 +21,40 @@ import os
7
  import uuid
8
 
9
  # Importing the model-related functions
10
- from stable_audio_tools import get_pretrained_model
11
  from stable_audio_tools.inference.generation import generate_diffusion_cond
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
  # Load the model outside of the GPU-decorated function
 
 
14
  def load_model():
 
 
 
 
15
  print("Loading model...")
16
  model, model_config = get_pretrained_model("santifiorino/SAO-Instrumental-Finetune")
17
  print("Model loaded successfully.")
@@ -20,6 +63,19 @@ def load_model():
20
  # Function to set up, generate, and process the audio
21
  @spaces.GPU(duration=120) # Allocate GPU only when this function is called
22
  def generate_audio(prompt, seconds_total=30, steps=100, cfg_scale=7):
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  print(f"Prompt received: {prompt}")
24
  print(f"Settings: Duration={seconds_total}s, Steps={steps}, CFG Scale={cfg_scale}")
25
 
@@ -28,7 +84,7 @@ def generate_audio(prompt, seconds_total=30, steps=100, cfg_scale=7):
28
 
29
  # Fetch the Hugging Face token from the environment variable
30
  hf_token = os.getenv('HF_TOKEN')
31
- print(f"Hugging Face token: {hf_token}")
32
 
33
  # Use pre-loaded model and configuration
34
  model, model_config = load_model()
@@ -82,65 +138,130 @@ def generate_audio(prompt, seconds_total=30, steps=100, cfg_scale=7):
82
  # Return the path to the generated audio file
83
  return unique_filename
84
 
 
85
  # Setting up the Gradio Interface
86
  interface = gr.Interface(
87
  fn=generate_audio,
88
  inputs=[
89
- gr.Textbox(label="Prompt", placeholder="Enter your text prompt here"),
 
 
 
 
90
  gr.Slider(0, 47, value=30, label="Duration in Seconds"),
91
  gr.Slider(10, 150, value=100, step=10, label="Number of Diffusion Steps"),
92
  gr.Slider(1, 15, value=7, step=0.1, label="CFG Scale")
93
  ],
94
- outputs=gr.Audio(type="filepath", label="Generated Audio"),
95
- title="Stable Audio Generator",
96
- description="Generate variable-length stereo audio at 44.1kHz from text prompts using Stable Audio Open 1.0.",
 
 
 
 
 
 
 
 
 
 
 
 
 
97
  examples=[
98
- [
99
- "Create a serene soundscape of a quiet beach at sunset.", # Text prompt
100
-
101
- 45, # Duration in Seconds
102
- 100, # Number of Diffusion Steps
103
- 10, # CFG Scale
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
  ],
105
- [
106
- "Generate an energetic and bustling city street scene with distant traffic and close conversations.", # Text prompt
107
-
108
- 30, # Duration in Seconds
109
- 120, # Number of Diffusion Steps
110
- 5, # CFG Scale
111
- ],
112
- [
113
- "Simulate a forest ambiance with birds chirping and wind rustling through the leaves.", # Text prompt
114
- 60, # Duration in Seconds
115
- 140, # Number of Diffusion Steps
116
- 7.5, # CFG Scale
117
- ],
118
- [
119
- "Recreate a gentle rainfall with distant thunder.", # Text prompt
120
-
121
- 35, # Duration in Seconds
122
- 110, # Number of Diffusion Steps
123
- 8, # CFG Scale
124
-
125
- ],
126
- [
127
- "Imagine a jazz cafe environment with soft music and ambient chatter.", # Text prompt
128
- 25, # Duration in Seconds
129
- 90, # Number of Diffusion Steps
130
- 6, # CFG Scale
131
-
132
- ],
133
- ["Rock beat played in a treated studio, session drumming on an acoustic kit.",
134
- 30, # Duration in Seconds
135
- 100, # Number of Diffusion Steps
136
- 7, # CFG Scale
137
-
138
- ]
139
- ])
140
-
141
 
142
  # Pre-load the model to avoid multiprocessing issues
143
  model, model_config = load_model()
144
 
145
  # Launch the Interface
146
- interface.launch()
 
 
1
+ """
2
+ Stable Audio Open Gradio Inference App for HuggingFace Spaces
3
+
4
+ This app provides a simple interface for generating high-quality instrumental music
5
+ using Stable Audio Open with the SAO-Instrumental-Finetune model.
6
+
7
+ Designed to be used as a remote computation tool for WeaveMuse.
8
+
9
+ Architecture:
10
+ - Stable Audio model is loaded OUTSIDE the GPU-decorated function
11
+ - Only the inference itself runs on GPU (cost-efficient for HF Spaces Zero GPU)
12
+ - Model initialization happens once at startup
13
+ """
14
+
15
  import torch
16
  import torchaudio
17
  from einops import rearrange
 
21
  import uuid
22
 
23
  # Importing the model-related functions
 
24
  from stable_audio_tools.inference.generation import generate_diffusion_cond
25
+ import json
26
+ from stable_audio_tools.models.factory import create_model_from_config
27
+ from stable_audio_tools.models.utils import load_ckpt_state_dict
28
+
29
+ from huggingface_hub import hf_hub_download
30
+
31
+
32
def get_pretrained_model(name="santifiorino/SAO-Instrumental-Finetune"):
    """
    Download and build the Stable Audio model from the Hugging Face Hub.

    Args:
        name: Hub repository id holding ``model_config.json`` and the
            model weights.

    Returns:
        Tuple ``(model, model_config)`` where ``model`` has the checkpoint
        weights loaded and ``model_config`` is the parsed JSON config dict.
    """
    # Fetch and parse the model configuration.
    model_config_path = hf_hub_download(name, filename="model_config.json", repo_type='model')

    with open(model_config_path, encoding="utf-8") as f:
        model_config = json.load(f)

    model = create_model_from_config(model_config)

    # Prefer the safetensors weights; if the repo does not provide
    # model.safetensors, fall back to the finetune's .ckpt file.
    # (NOTE: the fallback file really is SAO_Instrumental_Finetune.ckpt,
    # not a generic model.ckpt.)
    try:
        model_ckpt_path = hf_hub_download(name, filename="model.safetensors", repo_type='model')
    except Exception:
        model_ckpt_path = hf_hub_download(name, filename="SAO_Instrumental_Finetune.ckpt", repo_type='model')
    model.load_state_dict(load_ckpt_state_dict(model_ckpt_path))

    return model, model_config
49
 
50
  # Load the model outside of the GPU-decorated function
51
+
52
+
53
  def load_model():
54
+ """
55
+ Load the Stable Audio model outside GPU function.
56
+ This is called once at startup to download and cache the model.
57
+ """
58
  print("Loading model...")
59
  model, model_config = get_pretrained_model("santifiorino/SAO-Instrumental-Finetune")
60
  print("Model loaded successfully.")
 
63
  # Function to set up, generate, and process the audio
64
  @spaces.GPU(duration=120) # Allocate GPU only when this function is called
65
  def generate_audio(prompt, seconds_total=30, steps=100, cfg_scale=7):
66
+ """
67
+ Generate instrumental music using Stable Audio.
68
+ This function runs on GPU via @spaces.GPU decorator.
69
+
70
+ Args:
71
+ prompt: Text description of the music to generate
72
+ seconds_total: Duration in seconds (max 47)
73
+ steps: Number of diffusion steps (10-150)
74
+ cfg_scale: Classifier-free guidance scale (1.0-15.0)
75
+
76
+ Returns:
77
+ Path to generated audio file
78
+ """
79
  print(f"Prompt received: {prompt}")
80
  print(f"Settings: Duration={seconds_total}s, Steps={steps}, CFG Scale={cfg_scale}")
81
 
 
84
 
85
  # Fetch the Hugging Face token from the environment variable
86
  hf_token = os.getenv('HF_TOKEN')
87
+ print(f"Hugging Face token: {'set' if hf_token else 'not set'}")
88
 
89
  # Use pre-loaded model and configuration
90
  model, model_config = load_model()
 
138
  # Return the path to the generated audio file
139
  return unique_filename
140
 
141
+
142
# Setting up the Gradio Interface.
# NOTE(review): the mojibake replacement character in the "Other" bullet of
# the article markdown has been removed (it rendered as "�" in the UI).
interface = gr.Interface(
    fn=generate_audio,
    inputs=[
        gr.Textbox(
            label="Prompt",
            placeholder="Describe the instrumental music you want to generate...",
            value="Upbeat rock guitar with drums and bass"
        ),
        # Duration capped at 47 s — the model's maximum window (see article text).
        gr.Slider(0, 47, value=30, label="Duration in Seconds"),
        gr.Slider(10, 150, value=100, step=10, label="Number of Diffusion Steps"),
        gr.Slider(1, 15, value=7, step=0.1, label="CFG Scale")
    ],
    outputs=gr.Audio(type="filepath", label="Generated Music"),
    title="🎸 Stable Audio Instrumental Generator",
    description="""
    Generate high-quality instrumental music at 44.1kHz from text prompts using the SAO-Instrumental-Finetune model.

    **Features:**
    - 🎹 Piano, guitar, drums, bass, and orchestral instruments
    - 🎵 Various musical genres and styles
    - ⚡ High-quality stereo audio
    - 🎼 Perfect for music composition and production

    **Tips:**
    - Be specific about instruments, tempo, and mood
    - Higher steps = better quality (recommended: 100-120)
    - CFG Scale 7-10 works well for most prompts
    """,
    # Each example is [prompt, seconds_total, steps, cfg_scale], matching
    # the generate_audio signature.
    examples=[
        [
            "Energetic rock guitar riff with powerful drums and bass",
            30,
            100,
            7,
        ],
        [
            "Smooth jazz piano trio with upright bass and brushed drums",
            35,
            110,
            8,
        ],
        [
            "Epic orchestral strings and brass with cinematic percussion",
            45,
            120,
            10,
        ],
        [
            "Funky electric bass groove with rhythm guitar and tight drums",
            30,
            100,
            7,
        ],
        [
            "Acoustic guitar fingerpicking with soft percussion",
            40,
            110,
            6,
        ],
        [
            "Electronic synthesizer pads with ambient textures and subtle beats",
            35,
            100,
            7.5,
        ],
        [
            "Classical piano solo with expressive dynamics and sustain pedal",
            30,
            110,
            8,
        ],
        [
            "Blues guitar solo with bending notes over a shuffle rhythm section",
            30,
            100,
            7,
        ],
        [
            "Latin percussion ensemble with congas, bongos, and timbales",
            30,
            100,
            7,
        ],
        [
            "Rock beat played in a treated studio, session drumming on an acoustic kit",
            30,
            100,
            7,
        ]
    ],
    article="""
    ---
    ### About SAO-Instrumental-Finetune

    This model is a fine-tuned version of **Stable Audio Open 1.0** specifically trained for instrumental music generation.

    **Capabilities:**
    - 🎸 **Guitar**: Acoustic, electric, classical, jazz, rock
    - 🥁 **Drums**: Rock, jazz, electronic, orchestral percussion
    - 🎹 **Piano**: Classical, jazz, modern, ambient
    - **Orchestral**: Strings, brass, woodwinds
    - **Other**: Bass, synthesizers, ethnic instruments

    **Technical Details:**
    - Model: SAO-Instrumental-Finetune (based on Stable Audio Open 1.0)
    - Sample Rate: 44.1kHz (CD quality)
    - Max Duration: 47 seconds
    - Architecture: Latent diffusion model with conditioning

    **Integration:**
    This space is designed to work with **WeaveMuse** for AI-assisted music composition.
    Use the API endpoint for programmatic access in your music production workflows.

    ---

    *Powered by [Stability AI](https://stability.ai/) and [WeaveMuse](https://github.com/manoskary/weavemuse)*
    """
)
 
 
 
 
 
 
 
 
261
 
262
# Pre-load the model to avoid multiprocessing issues — this runs once at
# import time, outside the GPU-decorated function, so the weights are
# downloaded/cached before any request arrives.
model, model_config = load_model()

# Launch the Interface only when run as a script (not when imported).
if __name__ == "__main__":
    interface.launch()