"""Audio-to-image Gradio app.

Extracts simple loudness/brightness features from an uploaded audio clip,
maps them to a short text prompt, and renders an image with Stable Diffusion.
"""

import os
import sys

import gradio as gr  # NOTE: was imported twice in the original; once is enough
import librosa
import numpy as np
import torch
from diffusers import StableDiffusionPipeline

# Debug output: the script was evidently troubleshooting a Gradio install,
# so keep the version/location diagnostics.
print(f"Gradio version: {gr.__version__}")
print(f"Gradio location: {gr.__file__}")
print(f"Python executable: {sys.executable}")

# Ensure that the script uses CUDA if available
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load the Stable Diffusion model. float16 halves VRAM on GPU; CPU needs float32.
model_id = "runwayml/stable-diffusion-v1-5"  # Updated model ID for better accessibility
try:
    stable_diffusion = StableDiffusionPipeline.from_pretrained(
        model_id,
        torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    ).to(device)
except Exception as e:
    print(f"Error loading the model: {e}")
    print("Ensure you have the correct model ID and access rights.")
    sys.exit(1)  # sys.exit instead of bare exit(): exit() is for the REPL only


def describe_audio(audio_path):
    """
    Generate a textual description based on audio features.

    Parameters:
        audio_path (str): Path to the audio file.

    Returns:
        str: Generated description (falls back to a generic scene on error).
    """
    try:
        # Load the audio file at its native sampling rate
        y, sr = librosa.load(audio_path, sr=None)

        # Extract Mel Spectrogram (power), then convert to decibels
        S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
        db_spec = librosa.power_to_db(S, ref=np.max)

        # Calculate average amplitude (dB, <= 0) and average spectral centroid (Hz)
        avg_amplitude = np.mean(db_spec)
        spectral_centroids = librosa.feature.spectral_centroid(y=y, sr=sr)
        avg_frequency = np.mean(spectral_centroids)

        # Generate description based on amplitude: quieter -> calmer imagery
        if avg_amplitude < -40:
            amplitude_desc = "a calm and serene landscape with gentle waves"
        elif avg_amplitude < -20:
            amplitude_desc = "a vibrant forest with rustling leaves"
        else:
            amplitude_desc = "a thunderstorm with dark clouds and lightning"

        # Generate description based on frequency: brighter sound -> livelier scene
        if avg_frequency < 2000:
            frequency_desc = "under soft, ambient light"
        elif avg_frequency < 4000:
            frequency_desc = "with vivid and lively colors"
        else:
            frequency_desc = "in a surreal and dynamic setting"

        # Combine descriptions into one prompt
        return f"{amplitude_desc} {frequency_desc}"
    except Exception as e:
        print(f"Error processing audio: {e}")
        return "an abstract artistic scene"


def generate_image(description):
    """
    Generate an image using the Stable Diffusion model based on the description.

    Parameters:
        description (str): Textual description for image generation.

    Returns:
        PIL.Image or None: Generated image, or None if generation failed.
    """
    try:
        if device == "cuda":
            # autocast keeps the fp16 pipeline numerically stable on GPU
            with torch.autocast("cuda"):
                image = stable_diffusion(description).images[0]
        else:
            image = stable_diffusion(description).images[0]
        return image
    except Exception as e:
        print(f"Error generating image: {e}")
        return None


def audio_to_image(audio_file):
    """
    Convert an audio file to an artistic image.

    Parameters:
        audio_file (str): Path to the uploaded audio file.

    Returns:
        PIL.Image or str: Generated image, or an error message string.
    """
    if audio_file is None:
        return "No audio file provided."

    description = describe_audio(audio_file)
    print(f"Generated Description: {description}")

    image = generate_image(description)
    if image is not None:
        return image
    return "Failed to generate image."


# Gradio Interface
title = "🎵 Audio to Artistic Image Converter 🎨"
description_text = """
Upload an audio file, and this app will generate an artistic image based on the sound's characteristics.
"""

# Define example paths
example_paths = [
    "example_audio/calm_ocean.wav",
    "example_audio/rustling_leaves.wav",
    "example_audio/thunderstorm.wav",
]

# Verify example files exist; only ship examples that are actually on disk
valid_examples = []
for path in example_paths:
    if os.path.isfile(path):
        valid_examples.append([path])
    else:
        print(f"Example file not found: {path}")

if not os.path.exists("example_audio"):
    os.makedirs("example_audio")
    print("Please add some example audio files in the 'example_audio' directory.")

# NOTE(review): `source=` and `allow_flagging=` are Gradio 3.x keywords; Gradio 4.x
# renamed them to `sources=[...]` and `flagging_mode`. Confirm against the
# installed version printed at startup before upgrading.
interface = gr.Interface(
    fn=audio_to_image,
    inputs=gr.Audio(source="upload", type="filepath"),
    outputs=gr.Image(type="pil"),
    title=title,
    description=description_text,
    examples=valid_examples if valid_examples else None,
    allow_flagging="never",
    theme="default",
)

if __name__ == "__main__":
    interface.launch()