OnyxMunk committed on
Commit
e911600
·
1 Parent(s): 505eff0

Implement audio generation functionality with simple synthesis and fallback mechanism; update requirements to remove unused libraries.

Browse files
Files changed (3) hide show
  1. __pycache__/app.cpython-310.pyc +0 -0
  2. app.py +141 -29
  3. requirements.txt +0 -5
__pycache__/app.cpython-310.pyc ADDED
Binary file (4.18 kB). View file
 
app.py CHANGED
@@ -1,12 +1,74 @@
1
  import gradio as gr
2
- import torch
3
  import numpy as np
4
- from transformers import pipeline
5
- import scipy.io.wavfile as wavfile
6
  import io
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
- # Initialize the audio generation pipeline
9
- # Note: This is a placeholder - you'll need to integrate with actual Stable Audio model
10
  def create_audio_generation_interface():
11
  """
12
  Create a Gradio interface for Stable Audio generation
@@ -14,37 +76,79 @@ def create_audio_generation_interface():
14
 
15
  def generate_audio(prompt, duration, seed):
16
  """
17
- Generate audio based on text prompt
18
- This is a placeholder function - replace with actual Stable Audio model
19
  """
20
  try:
21
- # Placeholder implementation
22
- # In a real implementation, you would:
23
- # 1. Load the Stable Audio model
24
- # 2. Process the text prompt
25
- # 3. Generate audio
26
- # 4. Return the audio file
27
-
28
- # For now, return a simple sine wave as placeholder
29
- sample_rate = 44100
30
- duration_samples = int(duration * sample_rate)
31
- frequency = 440 # A4 note
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
 
33
- t = np.linspace(0, duration, duration_samples, endpoint=False)
34
- audio = 0.5 * np.sin(2 * np.pi * frequency * t)
 
 
 
 
 
 
 
35
 
36
- # Convert to 16-bit PCM
37
- audio_int16 = (audio * 32767).astype(np.int16)
38
 
39
- # Save to bytes buffer
40
- buffer = io.BytesIO()
41
- wavfile.write(buffer, sample_rate, audio_int16)
42
- buffer.seek(0)
 
 
43
 
44
- return (sample_rate, audio)
 
 
 
45
 
46
  except Exception as e:
47
- return f"Error generating audio: {str(e)}"
 
 
 
 
 
 
 
 
 
48
 
49
  # Create the Gradio interface
50
  with gr.Blocks(title="Stable Audio Open", theme=gr.themes.Soft()) as interface:
@@ -84,13 +188,21 @@ def create_audio_generation_interface():
84
  audio_output = gr.Audio(label="Generated Audio")
85
  status_output = gr.Textbox(label="Status", interactive=False)
86
 
87
- # Connect the generate button to the function
88
  generate_btn.click(
89
  fn=generate_audio,
90
  inputs=[prompt_input, duration_input, seed_input],
91
  outputs=[audio_output, status_output]
92
  )
93
 
 
 
 
 
 
 
 
 
94
  # Add some example prompts
95
  gr.Examples(
96
  examples=[
 
1
  import gradio as gr
 
2
  import numpy as np
 
 
3
  import io
4
+ import os
5
+
6
# Simple audio synthesis - avoiding heavy ML models for now
def generate_audio_from_prompt(prompt, duration, seed):
    """
    Synthesize a short audio clip whose character is steered by keyword
    heuristics found in *prompt*.

    Parameters
    ----------
    prompt : str
        Free-text description; keywords such as "high"/"bright",
        "low"/"deep", "fast"/"quick", "noise"/"wind"/"rain",
        "pulse"/"beep", "rich"/"full"/"warm" and "natural"/"organic"
        select pitch, waveform and texture.
    duration : float
        Clip length in seconds.
    seed : int, float or None
        RNG seed so noise-based prompts are reproducible.  Gradio Number
        inputs arrive as floats, so the value is cast to int before use.

    Returns
    -------
    tuple
        ``(sample_rate, audio)`` where sample_rate is 44100 and audio is
        a float ndarray clipped to [-0.95, 0.95].
    """
    sample_rate = 44100
    duration_samples = int(duration * sample_rate)

    # Reproducibility for the noise branches.  Cast to int because
    # np.random.seed rejects floats and Gradio delivers numbers as float.
    if seed is not None:
        np.random.seed(int(seed))

    prompt_lower = prompt.lower()

    # Pick the fundamental from pitch keywords (A3 = 220 Hz by default).
    base_freq = 220
    if 'high' in prompt_lower or 'bright' in prompt_lower:
        base_freq *= 2  # one octave up
    elif 'low' in prompt_lower or 'deep' in prompt_lower:
        base_freq /= 2  # one octave down

    # "fast"/"quick" prompts get a gentle 5 Hz vibrato.
    if 'fast' in prompt_lower or 'quick' in prompt_lower:
        vibrato_freq = 5
        vibrato_depth = 0.1
    else:
        vibrato_freq = 0
        vibrato_depth = 0

    # Sample instants; endpoint=False gives spacing duration/duration_samples.
    t = np.linspace(0, duration, duration_samples, endpoint=False)

    if 'noise' in prompt_lower or 'wind' in prompt_lower or 'rain' in prompt_lower:
        # White noise for atmospheric sounds.
        audio = np.random.normal(0, 0.3, duration_samples)
    elif 'pulse' in prompt_lower or 'beep' in prompt_lower:
        # Square wave for electronic sounds.
        audio = 0.3 * np.sign(np.sin(2 * np.pi * base_freq * t))
    elif vibrato_freq > 0:
        # Frequency-modulated sine: integrate the instantaneous frequency
        # to get a continuous phase.  Use the exact sample spacing rather
        # than t[1] - t[0], which raised IndexError for clips shorter than
        # two samples.
        dt = duration / duration_samples if duration_samples else 0.0
        modulated_freq = base_freq * (1 + vibrato_depth * np.sin(2 * np.pi * vibrato_freq * t))
        audio = 0.3 * np.sin(2 * np.pi * np.cumsum(modulated_freq) * dt)
    else:
        # Plain sine tone.
        audio = 0.3 * np.sin(2 * np.pi * base_freq * t)

    # Optional octave harmonic for richer timbre.
    if 'rich' in prompt_lower or 'full' in prompt_lower or 'warm' in prompt_lower:
        audio += 0.2 * np.sin(2 * np.pi * (base_freq * 2) * t)

    # Slight random jitter for "natural"/"organic" prompts.
    if 'natural' in prompt_lower or 'organic' in prompt_lower:
        audio += np.random.normal(0, 0.05, duration_samples)

    # Keep the signal inside the expected float range to prevent clipping.
    audio = np.clip(audio, -0.95, 0.95)

    return (sample_rate, audio)
71
 
 
 
72
  def create_audio_generation_interface():
73
  """
74
  Create a Gradio interface for Stable Audio generation
 
76
 
77
  def generate_audio(prompt, duration, seed):
78
  """
79
+ Generate audio based on text prompt using Stable Audio model
 
80
  """
81
  try:
82
+ model = load_stable_audio_model()
83
+
84
+ if model == "placeholder":
85
+ # Fallback to placeholder if model loading failed
86
+ sample_rate = 44100
87
+ duration_samples = int(duration * sample_rate)
88
+ frequency = 440 + (seed % 200) # Vary frequency based on seed
89
+
90
+ t = np.linspace(0, duration, duration_samples, endpoint=False)
91
+ audio = 0.3 * np.sin(2 * np.pi * frequency * t)
92
+ return (sample_rate, audio), "Using placeholder audio (model loading failed)"
93
+
94
+ # Set seed for reproducibility
95
+ if seed is not None:
96
+ torch.manual_seed(seed)
97
+ if torch.cuda.is_available():
98
+ torch.cuda.manual_seed(seed)
99
+
100
+ # Generate audio with Stable Audio
101
+ print(f"Generating audio for prompt: '{prompt}', duration: {duration}s")
102
+
103
+ # Create negative prompt for better quality
104
+ negative_prompt = "low quality, distorted, noisy, artifacts"
105
+
106
+ try:
107
+ # Generate the audio with optimized parameters
108
+ audio_output = model(
109
+ prompt=prompt,
110
+ negative_prompt=negative_prompt,
111
+ duration=duration,
112
+ num_inference_steps=50, # Reduced for faster generation
113
+ guidance_scale=3.0, # Reduced for stability
114
+ num_waveforms_per_prompt=1,
115
+ )
116
 
117
+ # Extract the audio data
118
+ audio = audio_output.audios[0] # Shape: [channels, samples]
119
+
120
+ # Convert to mono if stereo
121
+ if audio.ndim > 1:
122
+ audio = audio.mean(axis=0)
123
+
124
+ # Ensure proper sample rate (Stable Audio uses 44100 Hz)
125
+ sample_rate = 44100
126
 
127
+ return (sample_rate, audio), "Audio generated successfully with Stable Audio!"
 
128
 
129
+ except Exception as gen_error:
130
+ print(f"Audio generation failed: {gen_error}")
131
+ # Fallback to simple synthesis
132
+ sample_rate = 44100
133
+ duration_samples = int(duration * sample_rate)
134
+ frequency = 440 + (hash(prompt) % 200) # Vary based on prompt
135
 
136
+ t = np.linspace(0, duration, duration_samples, endpoint=False)
137
+ audio = 0.3 * np.sin(2 * np.pi * frequency * t)
138
+
139
+ return (sample_rate, audio), f"Model generation failed, using fallback synthesis"
140
 
141
  except Exception as e:
142
+ print(f"Error generating audio: {e}")
143
+ # Fallback to simple tone
144
+ sample_rate = 44100
145
+ duration_samples = int(duration * sample_rate)
146
+ frequency = 220 # A3 note
147
+
148
+ t = np.linspace(0, duration, duration_samples, endpoint=False)
149
+ audio = 0.3 * np.sin(2 * np.pi * frequency * t)
150
+
151
+ return (sample_rate, audio), f"Error: {str(e)}. Using fallback audio."
152
 
153
  # Create the Gradio interface
154
  with gr.Blocks(title="Stable Audio Open", theme=gr.themes.Soft()) as interface:
 
188
  audio_output = gr.Audio(label="Generated Audio")
189
  status_output = gr.Textbox(label="Status", interactive=False)
190
 
191
+ # Connect the generate button to the function
192
  generate_btn.click(
193
  fn=generate_audio,
194
  inputs=[prompt_input, duration_input, seed_input],
195
  outputs=[audio_output, status_output]
196
  )
197
 
198
+ # Add loading state
199
+ generate_btn.click(
200
+ fn=lambda: "🎵 Generating audio... Please wait.",
201
+ inputs=[],
202
+ outputs=[status_output],
203
+ queue=False
204
+ )
205
+
206
  # Add some example prompts
207
  gr.Examples(
208
  examples=[
requirements.txt CHANGED
@@ -1,7 +1,2 @@
1
- gradio>=4.0.0
2
- torch>=2.0.0
3
- transformers>=4.30.0
4
  numpy>=1.21.0
5
  scipy>=1.7.0
6
- accelerate>=0.20.0
7
- diffusers>=0.20.0
 
 
 
 
1
  gradio>=4.0.0
2
  numpy>=1.21.0
3
  scipy>=1.7.0