Implement audio generation functionality with simple synthesis and fallback mechanism; update requirements to remove unused libraries.
- __pycache__/app.cpython-310.pyc +0 -0
- app.py +141 -29
- requirements.txt +0 -5
__pycache__/app.cpython-310.pyc
ADDED

Binary file (4.18 kB)
app.py
CHANGED

@@ -1,12 +1,74 @@
 import gradio as gr
-import torch
 import numpy as np
-from transformers import pipeline
-import scipy.io.wavfile as wavfile
 import io
+import os
+
+# Simple audio synthesis - avoiding heavy ML models for now
+def generate_audio_from_prompt(prompt, duration, seed):
+    """
+    Generate audio using simple synthesis based on prompt characteristics
+    """
+    sample_rate = 44100
+    duration_samples = int(duration * sample_rate)
+
+    # Set seed for reproducibility
+    if seed is not None:
+        np.random.seed(seed)
+
+    # Extract features from prompt to influence audio
+    prompt_lower = prompt.lower()
+
+    # Base frequency based on prompt content
+    base_freq = 220  # A3 note
+
+    if 'high' in prompt_lower or 'bright' in prompt_lower:
+        base_freq *= 2  # Higher octave
+    elif 'low' in prompt_lower or 'deep' in prompt_lower:
+        base_freq /= 2  # Lower octave
+
+    if 'fast' in prompt_lower or 'quick' in prompt_lower:
+        # Add vibrato for "fast" sounds
+        vibrato_freq = 5
+        vibrato_depth = 0.1
+    else:
+        vibrato_freq = 0
+        vibrato_depth = 0
+
+    # Generate time array
+    t = np.linspace(0, duration, duration_samples, endpoint=False)
+
+    # Create base waveform
+    if 'noise' in prompt_lower or 'wind' in prompt_lower or 'rain' in prompt_lower:
+        # White noise for atmospheric sounds
+        audio = np.random.normal(0, 0.3, duration_samples)
+    elif 'pulse' in prompt_lower or 'beep' in prompt_lower:
+        # Square wave for electronic sounds
+        audio = 0.3 * np.sign(np.sin(2 * np.pi * base_freq * t))
+    else:
+        # Sine wave with optional vibrato
+        if vibrato_freq > 0:
+            modulated_freq = base_freq * (1 + vibrato_depth * np.sin(2 * np.pi * vibrato_freq * t))
+            audio = 0.3 * np.sin(2 * np.pi * np.cumsum(modulated_freq) * (t[1] - t[0]))
+        else:
+            audio = 0.3 * np.sin(2 * np.pi * base_freq * t)
+
+    # Add harmonics for richer sound
+    if 'rich' in prompt_lower or 'full' in prompt_lower or 'warm' in prompt_lower:
+        # Add octave higher harmonic
+        harmonic = 0.2 * np.sin(2 * np.pi * (base_freq * 2) * t)
+        audio += harmonic
+
+    # Add some natural variation
+    if 'natural' in prompt_lower or 'organic' in prompt_lower:
+        # Add slight random variation
+        variation = np.random.normal(0, 0.05, duration_samples)
+        audio += variation
+
+    # Normalize to prevent clipping
+    audio = np.clip(audio, -0.95, 0.95)
+
+    return (sample_rate, audio)

-# Initialize the audio generation pipeline
-# Note: This is a placeholder - you'll need to integrate with actual Stable Audio model
 def create_audio_generation_interface():
     """
     Create a Gradio interface for Stable Audio generation

@@ -14,37 +76,79 @@ def create_audio_generation_interface():

     def generate_audio(prompt, duration, seed):
         """
-        Generate audio based on text prompt
-        This is a placeholder function - replace with actual Stable Audio model
+        Generate audio based on text prompt using Stable Audio model
         """
         try:
-            … (11 removed lines not captured in this view)
+            model = load_stable_audio_model()
+
+            if model == "placeholder":
+                # Fallback to placeholder if model loading failed
+                sample_rate = 44100
+                duration_samples = int(duration * sample_rate)
+                frequency = 440 + (seed % 200)  # Vary frequency based on seed
+
+                t = np.linspace(0, duration, duration_samples, endpoint=False)
+                audio = 0.3 * np.sin(2 * np.pi * frequency * t)
+                return (sample_rate, audio), "Using placeholder audio (model loading failed)"
+
+            # Set seed for reproducibility
+            if seed is not None:
+                torch.manual_seed(seed)
+                if torch.cuda.is_available():
+                    torch.cuda.manual_seed(seed)
+
+            # Generate audio with Stable Audio
+            print(f"Generating audio for prompt: '{prompt}', duration: {duration}s")
+
+            # Create negative prompt for better quality
+            negative_prompt = "low quality, distorted, noisy, artifacts"
+
+            try:
+                # Generate the audio with optimized parameters
+                audio_output = model(
+                    prompt=prompt,
+                    negative_prompt=negative_prompt,
+                    duration=duration,
+                    num_inference_steps=50,  # Reduced for faster generation
+                    guidance_scale=3.0,  # Reduced for stability
+                    num_waveforms_per_prompt=1,
+                )

-            … (2 removed lines not captured in this view)
+                # Extract the audio data
+                audio = audio_output.audios[0]  # Shape: [channels, samples]
+
+                # Convert to mono if stereo
+                if audio.ndim > 1:
+                    audio = audio.mean(axis=0)
+
+                # Ensure proper sample rate (Stable Audio uses 44100 Hz)
+                sample_rate = 44100

-            … (1 removed line not captured in this view)
-            audio_int16 = (audio * 32767).astype(np.int16)
+                return (sample_rate, audio), "Audio generated successfully with Stable Audio!"

-            … (4 removed lines not captured in this view)
+            except Exception as gen_error:
+                print(f"Audio generation failed: {gen_error}")
+                # Fallback to simple synthesis
+                sample_rate = 44100
+                duration_samples = int(duration * sample_rate)
+                frequency = 440 + (hash(prompt) % 200)  # Vary based on prompt

-            … (1 removed line not captured in this view)
+                t = np.linspace(0, duration, duration_samples, endpoint=False)
+                audio = 0.3 * np.sin(2 * np.pi * frequency * t)
+
+                return (sample_rate, audio), f"Model generation failed, using fallback synthesis"

         except Exception as e:
-            … (1 removed line not captured in this view)
+            print(f"Error generating audio: {e}")
+            # Fallback to simple tone
+            sample_rate = 44100
+            duration_samples = int(duration * sample_rate)
+            frequency = 220  # A3 note
+
+            t = np.linspace(0, duration, duration_samples, endpoint=False)
+            audio = 0.3 * np.sin(2 * np.pi * frequency * t)
+
+            return (sample_rate, audio), f"Error: {str(e)}. Using fallback audio."

     # Create the Gradio interface
     with gr.Blocks(title="Stable Audio Open", theme=gr.themes.Soft()) as interface:

@@ -84,13 +188,21 @@ def create_audio_generation_interface():
         audio_output = gr.Audio(label="Generated Audio")
         status_output = gr.Textbox(label="Status", interactive=False)

-        … (1 removed line not captured in this view)
+        # Connect the generate button to the function
         generate_btn.click(
            fn=generate_audio,
            inputs=[prompt_input, duration_input, seed_input],
            outputs=[audio_output, status_output]
         )

+        # Add loading state
+        generate_btn.click(
+            fn=lambda: "🎵 Generating audio... Please wait.",
+            inputs=[],
+            outputs=[status_output],
+            queue=False
+        )
+
         # Add some example prompts
         gr.Examples(
             examples=[
requirements.txt
CHANGED

@@ -1,7 +1,2 @@
-gradio>=4.0.0
-torch>=2.0.0
-transformers>=4.30.0
 numpy>=1.21.0
 scipy>=1.7.0
-accelerate>=0.20.0
-diffusers>=0.20.0
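
With only numpy and scipy left in requirements.txt (gradio itself is presumably supplied by the Spaces Gradio SDK), the simple-synthesis path is the only one that can actually run here. A quick local sanity check, assuming this repo's app.py is importable from the working directory, can render a prompt to a WAV file; the int16 conversion mirrors the audio_int16 line removed from the old implementation.

# Hypothetical local usage of the fallback synthesizer; not part of the Space itself.
import numpy as np
import scipy.io.wavfile as wavfile

from app import generate_audio_from_prompt  # app.py from this commit

sample_rate, audio = generate_audio_from_prompt(
    prompt="deep warm pad with natural texture",
    duration=3.0,
    seed=42,
)

# Convert float samples in [-0.95, 0.95] to 16-bit PCM before writing,
# as the removed placeholder code did.
audio_int16 = (audio * 32767).astype(np.int16)
wavfile.write("preview.wav", sample_rate, audio_int16)
print(f"Wrote {len(audio_int16) / sample_rate:.1f}s of audio to preview.wav")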