File size: 4,689 Bytes
dcbc4e8
a57ce2c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9a1ca09
a57ce2c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9a1ca09
a57ce2c
 
 
 
 
9a1ca09
a57ce2c
 
 
 
 
 
 
 
 
ab8268b
a57ce2c
 
 
45c7a65
a57ce2c
 
 
ab8268b
a57ce2c
 
45c7a65
a57ce2c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5f9ec40
dcbc4e8
c64a0b8
a57ce2c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
import os
import sys

import gradio as gr
import librosa
import numpy as np
import torch
from diffusers import StableDiffusionPipeline

# Diagnostic prints to debug environment/version mismatches.
print(f"Gradio version: {gr.__version__}")
print(f"Gradio location: {gr.__file__}")
print(f"Python executable: {sys.executable}")

# Select the compute device: prefer CUDA when a GPU is available.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load the Stable Diffusion text-to-image pipeline.
# float16 halves GPU memory; CPU inference requires float32.
model_id = "runwayml/stable-diffusion-v1-5"
try:
    stable_diffusion = StableDiffusionPipeline.from_pretrained(
        model_id,
        torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    ).to(device)
except Exception as e:
    print(f"Error loading the model: {e}")
    print("Ensure you have the correct model ID and access rights.")
    # sys.exit, not the site-module exit(), is the correct way for a
    # script to terminate (exit() may be absent under `python -S`).
    sys.exit(1)

def describe_audio(audio_path):
    """
    Build a short scene description from simple audio features.

    Parameters:
        audio_path (str): Path to the audio file to analyze.

    Returns:
        str: A prompt-style description; a generic fallback on any error.
    """
    try:
        # Decode the audio at its native sampling rate.
        waveform, sample_rate = librosa.load(audio_path, sr=None)

        # Mel spectrogram in decibels, used as a loudness estimate.
        mel = librosa.feature.melspectrogram(y=waveform, sr=sample_rate, n_mels=128)
        mel_db = librosa.power_to_db(mel, ref=np.max)
        loudness = np.mean(mel_db)

        # Mean spectral centroid as a rough "brightness" estimate.
        centroids = librosa.feature.spectral_centroid(y=waveform, sr=sample_rate)
        brightness = np.mean(centroids)

        # Map loudness onto a scene.
        if loudness < -40:
            scene = "a calm and serene landscape with gentle waves"
        elif loudness < -20:
            scene = "a vibrant forest with rustling leaves"
        else:
            scene = "a thunderstorm with dark clouds and lightning"

        # Map brightness onto a lighting/mood qualifier.
        if brightness < 2000:
            mood = "under soft, ambient light"
        elif brightness < 4000:
            mood = "with vivid and lively colors"
        else:
            mood = "in a surreal and dynamic setting"

        return f"{scene} {mood}"
    except Exception as e:
        print(f"Error processing audio: {e}")
        return "an abstract artistic scene"

def generate_image(description):
    """
    Run the Stable Diffusion pipeline on a text prompt.

    Parameters:
        description (str): Prompt text for image generation.

    Returns:
        PIL.Image or None: The first generated image, or None on failure.
    """
    try:
        # Mixed precision only applies on GPU; plain precision on CPU.
        if device != "cuda":
            return stable_diffusion(description).images[0]
        with torch.autocast("cuda"):
            return stable_diffusion(description).images[0]
    except Exception as e:
        print(f"Error generating image: {e}")
        return None

def audio_to_image(audio_file):
    """
    Pipeline entry point: audio file in, artistic image (or message) out.

    Parameters:
        audio_file (str or None): Path to the uploaded audio file.

    Returns:
        PIL.Image or str: Generated image, or an error message string.
    """
    # Guard: Gradio passes None when nothing was uploaded.
    if audio_file is None:
        return "No audio file provided."

    prompt = describe_audio(audio_file)
    print(f"Generated Description: {prompt}")

    result = generate_image(prompt)
    return result if result is not None else "Failed to generate image."

# Gradio Interface
title = "🎵 Audio to Artistic Image Converter 🎨"
description_text = """
Upload an audio file, and this app will generate an artistic image based on the sound's characteristics.
"""

# Candidate example clips shipped alongside the app.
example_paths = [
    "example_audio/calm_ocean.wav",
    "example_audio/rustling_leaves.wav",
    "example_audio/thunderstorm.wav",
]

# Keep only examples that exist on disk; each is wrapped in a
# one-element list, the row shape Gradio expects for examples.
valid_examples = []
for candidate in example_paths:
    if not os.path.isfile(candidate):
        print(f"Example file not found: {candidate}")
        continue
    valid_examples.append([candidate])

if not os.path.exists("example_audio"):
    os.makedirs("example_audio")
    print("Please add some example audio files in the 'example_audio' directory.")

# NOTE(review): gr.Audio(source=...) and allow_flagging= are the Gradio
# 3.x API; Gradio 4 renamed them to sources=[...] and flagging_mode=.
# Confirm against the installed version (printed at startup above).
interface = gr.Interface(
    fn=audio_to_image,
    inputs=gr.Audio(source="upload", type="filepath"),
    outputs=gr.Image(type="pil"),
    title=title,
    description=description_text,
    examples=valid_examples if valid_examples else None,
    allow_flagging="never",
    theme="default",
)

if __name__ == "__main__":
    interface.launch()