"""SonicDiffusion demo: a Gradio app that renders an image from a text prompt.

The UI takes an uploaded audio clip and a prompt.  The clip is re-encoded to
WAV on disk; only the prompt is passed to the Stable Diffusion pipeline.
"""

import gradio as gr
import torch
from diffusers import StableDiffusionPipeline
from pydub import AudioSegment

# === Use GPU if available ===
device = "cuda" if torch.cuda.is_available() else "cpu"

# === Load model ===
model_id = "stabilityai/stable-diffusion-2-1"
pipe = StableDiffusionPipeline.from_pretrained(model_id)
pipe.to(device)

# Where the uploaded clip is written (relative to the working directory).
AUDIO_PATH = "train.wav"


# === Define function ===
def generate_image(audio, prompt):
    """Save the uploaded audio as WAV and generate an image for *prompt*.

    Args:
        audio: Path to the uploaded audio file as supplied by the Gradio
            ``Audio`` component (``type="filepath"``), an already decoded
            ``pydub.AudioSegment``, or ``None`` when nothing was uploaded.
        prompt: Text prompt passed to the Stable Diffusion pipeline.

    Returns:
        The first image produced by the pipeline, or ``None`` when ``audio``
        is ``None``.

    Note:
        The audio is only written to ``AUDIO_PATH``; it does not influence
        the generated image, which depends on ``prompt`` alone.
    """
    if audio is None:
        return None

    # Gradio hands over a file path; decode it with pydub unless the caller
    # already passed a decoded segment.
    if isinstance(audio, AudioSegment):
        segment = audio
    else:
        segment = AudioSegment.from_file(audio)

    # Save audio temporarily
    segment.export(AUDIO_PATH, format="wav")

    result = pipe(prompt, guidance_scale=7.5, num_inference_steps=30).images[0]
    return result


interface = gr.Interface(
    fn=generate_image,
    inputs=[
        # Gradio's Audio component supports type="numpy" or type="filepath";
        # "filepath" lets pydub do the decoding inside generate_image.
        gr.Audio(source="upload", type="filepath"),
        gr.Textbox(label="Prompt", value="A surreal dreamscape made of music"),
    ],
    outputs=gr.Image(type="pil"),
    title="🎧 SonicDiffusion: Audio → Image Generator",
)

# Start the web UI only when run as a script, so importing this module does
# not block on the server.
if __name__ == "__main__":
    interface.launch()