Spaces:

dskill
/

sd-audio-cpu

Runtime error

App Files Files Community

Drew committed on Jun 20, 2024

Commit

c879843

1 Parent(s): 929aad9

tests

Browse files

Files changed (2) hide show

app.py +10 -44
requirements.txt +5 -1

app.py CHANGED Viewed

@@ -5,6 +5,7 @@ import gradio as gr
 import spaces
 import os
 import uuid
 # Importing the model-related functions
 from stable_audio_tools import get_pretrained_model
@@ -79,8 +80,15 @@ def generate_audio(prompt, seconds_total=30, steps=100, cfg_scale=7):
     torchaudio.save(unique_filename, output, sample_rate)
     print(f"Audio saved: {unique_filename}")
     # Return the path to the generated audio file
-    return unique_filename
 # Setting up the Gradio Interface
 interface = gr.Interface(
@@ -94,49 +102,7 @@ interface = gr.Interface(
     outputs=gr.Audio(type="filepath", label="Generated Audio"),
     title="Stable Audio Generator",
     description="Generate variable-length stereo audio at 44.1kHz from text prompts using Stable Audio Open 1.0.",
-    examples=[
-    [
-        "Create a serene soundscape of a quiet beach at sunset.",  # Text prompt
-        45,  # Duration in Seconds
-        100,  # Number of Diffusion Steps
-        10,  # CFG Scale
-    ],
-    [
-        "Generate an energetic and bustling city street scene with distant traffic and close conversations.",  # Text prompt
-        30,  # Duration in Seconds
-        120,  # Number of Diffusion Steps
-        5,  # CFG Scale
-    ],
-    [
-        "Simulate a forest ambiance with birds chirping and wind rustling through the leaves.",  # Text prompt
-        60,  # Duration in Seconds
-        140,  # Number of Diffusion Steps
-        7.5,  # CFG Scale
-    ],
-    [
-        "Recreate a gentle rainfall with distant thunder.",  # Text prompt
-        35,  # Duration in Seconds
-        110,  # Number of Diffusion Steps
-        8,  # CFG Scale
-    ],
-    [
-        "Imagine a jazz cafe environment with soft music and ambient chatter.",  # Text prompt
-        25,  # Duration in Seconds
-        90,  # Number of Diffusion Steps
-        6,  # CFG Scale
-    ],
-    ["Rock beat played in a treated studio, session drumming on an acoustic kit.",
-        30,  # Duration in Seconds
-        100,  # Number of Diffusion Steps
-        7,  # CFG Scale
-    ]
-])
 # Pre-load the model to avoid multiprocessing issues

 import spaces
 import os
 import uuid
+from pydub import AudioSegment
 # Importing the model-related functions
 from stable_audio_tools import get_pretrained_model
     torchaudio.save(unique_filename, output, sample_rate)
     print(f"Audio saved: {unique_filename}")
+    # Convert WAV to MP3 using pydub without ffmpeg
+    audio = AudioSegment.from_wav(unique_filename)
+    full_path_mp3 = unique_filename.replace('wav', 'mp3')
+    audio.export(full_path_mp3, format="mp3")
+    print(f"Audio converted and saved to MP3: {full_path_mp3}")
     # Return the path to the generated audio file
+    return audio
 # Setting up the Gradio Interface
 interface = gr.Interface(
     outputs=gr.Audio(type="filepath", label="Generated Audio"),
     title="Stable Audio Generator",
     description="Generate variable-length stereo audio at 44.1kHz from text prompts using Stable Audio Open 1.0.",
+)
 # Pre-load the model to avoid multiprocessing issues

requirements.txt CHANGED Viewed

@@ -9,4 +9,8 @@ torch
 torchaudio
 stable-audio-tools
 openai
-pydub

 torchaudio
 stable-audio-tools
 openai
+pydub
+git+https://github.com/huggingface/diffusers.git
+transformers
+accelerate
+sentencepiece