"""Gradio front-end that turns a text description into a short sound effect.

The script loads the AudioLDM text-to-audio pipeline once at start-up, exposes
a small web form (prompt, duration, guidance scale) and plays back the
generated waveform in the browser.
"""

import gradio as gr
import torch
from diffusers import AudioLDMPipeline

# Sample rate (Hz) reported to Gradio together with the generated waveform.
# NOTE(review): hard-coded as in the original script; confirm it matches the
# sampling rate of the checkpoint's vocoder if the model is ever swapped.
SAMPLE_RATE_HZ = 16000

# Number of denoising steps per request (more steps: slower, usually cleaner).
NUM_INFERENCE_STEPS = 25

# Artefacts the model is steered away from on every request.
NEGATIVE_PROMPT = "low quality, static, noise, distorted, background noise, messy"

print("Loading AudioLDM Model...")
repo_id = "cvssp/audioldm-s-full-v2"
# Loaded once at import time so every request reuses the same pipeline.
pipe = AudioLDMPipeline.from_pretrained(repo_id, torch_dtype=torch.float32)


def generate_audio(text_prompt: str, duration: float, guidance: float) -> tuple:
    """Generate a sound effect for *text_prompt*.

    Args:
        text_prompt: Free-text description of the desired sound.
        duration: Requested clip length in seconds (forwarded as
            ``audio_length_in_s``).
        guidance: Guidance scale (forwarded as ``guidance_scale``).

    Returns:
        A ``(sample_rate, waveform)`` tuple, where ``waveform`` is the first
        entry of the pipeline's ``audios`` output.

    Raises:
        gr.Error: If the prompt is empty or contains only whitespace, so the
            user gets a visible message instead of a wasted diffusion run.
    """
    if not text_prompt or not text_prompt.strip():
        raise gr.Error("Please describe the sound you want to generate.")

    print(f"Generating sound for: {text_prompt}")

    result = pipe(
        prompt=text_prompt,
        negative_prompt=NEGATIVE_PROMPT,
        num_inference_steps=NUM_INFERENCE_STEPS,
        audio_length_in_s=duration,
        guidance_scale=guidance,
    )
    # Only one prompt is submitted per call, so only the first clip is used.
    audio = result.audios[0]

    return (SAMPLE_RATE_HZ, audio)


with gr.Blocks() as app:
    gr.Markdown("# 🎬 The Foley Artist AI")
    gr.Markdown("Generate high-fidelity sound effects from text descriptions.")

    with gr.Row():
        # Left column: generation controls.
        with gr.Column():
            prompt = gr.Textbox(
                label="Describe the Sound",
                lines=2,
                placeholder="e.g., A dog barking loudly in an empty room",
            )
            # With min=2.5, max=5.0 and step=2.5 the slider offers exactly
            # two choices: 2.5 s and 5.0 s.
            duration = gr.Slider(
                minimum=2.5,
                maximum=5.0,
                value=2.5,
                step=2.5,
                label="Duration (Seconds)",
            )
            guidance = gr.Slider(
                minimum=1.0,
                maximum=5.0,
                value=3.0,
                step=0.5,
                label="Guidance Scale (How strictly to follow text)",
            )
            generate_btn = gr.Button("Generate Audio", variant="primary")

        # Right column: playback of the generated clip.
        with gr.Column():
            audio_output = gr.Audio(label="Generated Sound Effect")

    # Event wiring is declared inside the Blocks context.
    generate_btn.click(
        generate_audio,
        inputs=[prompt, duration, guidance],
        outputs=[audio_output],
    )

app.launch(ssr_mode=False)