A
File size: 7,842 Bytes
b98e447
 
ce13058
1902030
ce13058
 
 
 
1902030
ce13058
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b98e447
ce13058
 
 
 
b98e447
ce13058
1902030
ce13058
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1902030
ce13058
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1902030
ce13058
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1902030
 
ce13058
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
import gradio as gr
from pathlib import Path
import yt_dlp
import logging
import librosa
import numpy as np
from PIL import Image
import ffmpeg
import shutil
import tempfile
import time

# Set up logging for debugging.
# basicConfig configures the root logger, so the DEBUG level chosen here also
# applies to messages that imported libraries emit through the standard
# logging module, not only to the logging.debug/error calls in this file.
logging.basicConfig(level=logging.DEBUG)

def _download_audio(youtube_url, output_dir):
    """Download the audio track of *youtube_url* into *output_dir* as an MP3.

    Args:
        youtube_url (str): Already validated YouTube video URL.
        output_dir (Path): Existing directory that receives the download.

    Returns:
        Path: Location of the MP3 written by the FFmpegExtractAudio step.

    Raises:
        FileNotFoundError: If no MP3 file exists in *output_dir* afterwards.
    """
    ydl_opts = {
        'format': 'bestaudio/best',
        'outtmpl': str(output_dir / '%(title)s.%(ext)s'),
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
        'restrictfilenames': True,
        'noplaylist': True,
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(youtube_url, download=True)
        # 'restrictfilenames' rewrites the title on disk, so the raw
        # info['title'] is not a reliable filename. Ask yt-dlp for the name it
        # generated and swap the container extension for the extracted codec.
        audio_file = Path(ydl.prepare_filename(info)).with_suffix(".mp3")

    if not audio_file.exists():
        # output_dir is created fresh for every run, so any MP3 found in it
        # belongs to this download.
        candidates = sorted(output_dir.glob("*.mp3"))
        if not candidates:
            raise FileNotFoundError(f"No MP3 file was produced in {output_dir}")
        audio_file = candidates[0]

    logging.debug(f"Downloaded audio: {audio_file}")
    return audio_file


def _keep_audio_for_serving(audio_file):
    """Move *audio_file* into its own temp directory that is not cleaned up here.

    The working directory of analyze_audio is removed before the function
    returns, so a path inside it would already be gone when the caller tries
    to read the file. The moved copy deliberately outlives the call.

    Returns:
        Path: New location of the audio file (same file name).
    """
    keep_dir = Path(tempfile.mkdtemp(prefix="audio_analysis_out_"))
    kept_file = keep_dir / audio_file.name
    shutil.move(str(audio_file), str(kept_file))
    return kept_file


def _summarize_audio_features(audio_file):
    """Return a one-line text summary of librosa features for *audio_file*.

    The summary contains the mean of the first three MFCC coefficients, the
    mean spectral centroid and the estimated tempo.
    """
    y, sr = librosa.load(str(audio_file))
    hop_length = 512  # Valid hop_length to fix "Invalid hop_length: 0" error
    logging.debug(f"Using hop_length: {hop_length}")

    # Extract features
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13, hop_length=hop_length)
    spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr, hop_length=hop_length)
    tempo, _ = librosa.beat.beat_track(y=y, sr=sr, hop_length=hop_length)

    # Aggregate features
    mfcc_mean = np.mean(mfcc, axis=1).tolist()[:3]  # Mean of first 3 MFCC coefficients
    spectral_centroid_mean = float(np.mean(spectral_centroid))
    # beat_track may hand back the tempo as a 1-element array rather than a
    # scalar; an array cannot be formatted with ':.2f', so normalise it first.
    tempo_bpm = float(np.atleast_1d(tempo)[0])

    return (
        f"Audio Features: MFCC (mean of first 3 coeffs): {mfcc_mean}, "
        f"Spectral Centroid: {spectral_centroid_mean:.2f} Hz, "
        f"Tempo: {tempo_bpm:.2f} BPM"
    )


def analyze_audio(youtube_url, input_text, input_image=None, slider_value=50, checkbox_value=False):
    """
    Downloads YouTube audio, performs automatic audio feature analysis with librosa, and processes inputs.
    Working files live in a per-call temporary directory that is removed before returning.

    Args:
        youtube_url (str): YouTube video URL (optional). None, empty or
            whitespace-only values mean "no download".
        input_text (str): Text input for processing.
        input_image (PIL.Image, optional): Image input for processing.
        slider_value (float): Numerical parameter (e.g., analysis threshold).
        checkbox_value (bool): Toggle for enhanced analysis.

    Returns:
        tuple: (processed_text, output_image_display, output_audio, extra_info).
            output_audio is None or the path of the downloaded MP3; that file
            is stored outside the per-call working directory so it still
            exists after this function returns.
    """
    # Create a unique temporary directory for this run
    temp_dir = Path(tempfile.mkdtemp(prefix="audio_analysis_"))

    try:
        output_dir = temp_dir / "downloaded_media"
        output_dir.mkdir(parents=True, exist_ok=True)
        logging.debug(f"Created temporary directory: {temp_dir}, output directory: {output_dir}")

        # A cleared textbox or stray whitespace must not count as a URL.
        youtube_url = (youtube_url or "").strip()

        # Initialize outputs
        processed_text = f"Processed: '{input_text}'."
        output_image_display = input_image
        output_audio = None
        extra_info = f"Threshold: {slider_value/100:.2f}"

        # Handle YouTube download if URL is provided
        if youtube_url:
            # Validate YouTube URL
            if not youtube_url.startswith(("https://www.youtube.com/", "https://youtu.be/")):
                return "Error: Invalid YouTube URL", None, None, "Processing failed."

            try:
                audio_file = _keep_audio_for_serving(_download_audio(youtube_url, output_dir))
                # Set before the analysis so the audio is still returned when
                # feature extraction fails.
                output_audio = str(audio_file)

                features_summary = _summarize_audio_features(audio_file)
                processed_text += f" {features_summary}."
                extra_info += f", Audio: {audio_file.name}"

            except Exception as e:
                # Best effort: report the failure in the result text and carry
                # on with the remaining inputs.
                logging.exception(f"YouTube download or audio processing error: {str(e)}")
                processed_text += f" Error processing YouTube audio: {str(e)}."

        # Handle image processing if provided
        if input_image is not None:
            from PIL import ImageEnhance
            enhancer = ImageEnhance.Brightness(input_image)
            output_image_display = enhancer.enhance(1.5)
            processed_text += " Image processed (brightened)."
        else:
            processed_text += " No image provided."

        # Incorporate slider and checkbox
        processed_text += f" Slider: {slider_value}, Enhanced Analysis: {checkbox_value}."
        if checkbox_value:
            processed_text += " Enhanced analysis enabled."
            if youtube_url and slider_value > 50:
                processed_text += f" High threshold ({slider_value}) applied for deeper analysis."

        return processed_text, output_image_display, output_audio, extra_info

    except Exception as e:
        logging.exception(f"Error in analyze_audio: {str(e)}")
        return f"Error: {str(e)}", None, None, "Processing failed."

    finally:
        # The finally clause runs before the caller receives the return value,
        # so sleeping here would only delay the response. The returned audio
        # has already been moved out of temp_dir, so it can be removed at once.
        try:
            if temp_dir.exists():
                shutil.rmtree(temp_dir)
                logging.debug(f"Cleaned up temporary directory: {temp_dir}")
        except OSError as e:
            logging.error(f"Error cleaning up temporary directory: {str(e)}")

# Input widgets, declared in the same order as the parameters of analyze_audio.

# Optional source URL for the audio download.
input_youtube_url = gr.Textbox(
    info="Optional: Enter a YouTube URL to download and analyze audio.",
    placeholder="e.g., https://www.youtube.com/watch?v=dQw4w9WgXcQ",
    label="YouTube Video URL",
)

# Free-form text that is echoed back in the analysis result.
input_text_component = gr.Textbox(
    info="Type a description or query for processing.",
    placeholder="e.g., Analyze this audio track",
    label="Input Text",
)

# Optional picture; type="pil" makes Gradio pass a PIL image to the callback.
input_image_component = gr.Image(
    sources=["upload", "webcam", "clipboard"],
    label="Upload Image (Optional)",
    type="pil",
)

# Integer-stepped threshold in the range 0..100, starting at 50.
input_slider_component = gr.Slider(
    label="Analysis Threshold",
    info="Adjusts sensitivity of audio feature analysis.",
    value=50,
    step=1,
    minimum=0,
    maximum=100,
)

# Switch for the "enhanced analysis" wording in the result text.
input_checkbox_component = gr.Checkbox(
    info="Toggle for deeper audio feature extraction.",
    label="Enable Enhanced Analysis",
)

# Define output components
# NOTE: `info` is a keyword of form-style components such as gr.Textbox.
# gr.Image, gr.Audio and gr.Label do not accept it in Gradio 4+, where passing
# it raises TypeError while this module is being imported. Their help text is
# therefore kept as comments instead of constructor arguments.
output_text_component = gr.Textbox(
    label="Analysis Results",
    info="Text results including audio feature analysis."
)
# Processed image output (if provided).
output_image_component = gr.Image(
    label="Processed Image (if any)"
)
# Audio downloaded from YouTube.
output_audio_component = gr.Audio(
    label="Downloaded Audio",
    type="filepath"
)
# Feature analysis details and processing info.
output_label_component = gr.Label(
    label="Analysis Summary"
)

# Wire the callback to its widgets. The order of `inputs` matches the
# positional parameters of analyze_audio, and the order of `outputs` matches
# the 4-tuple it returns.
iface = gr.Interface(
    title="YouTube Audio Feature Analysis",
    description="Download YouTube audio, analyze features with librosa, and process text/image inputs. Customize with slider and checkbox.",
    theme=gr.themes.Soft(),
    allow_flagging="never",
    fn=analyze_audio,
    inputs=[
        input_youtube_url,
        input_text_component,
        input_image_component,
        input_slider_component,
        input_checkbox_component,
    ],
    outputs=[
        output_text_component,
        output_image_component,
        output_audio_component,
        output_label_component,
    ],
    # Each example row supplies one value per input: url, text, image, slider, checkbox.
    examples=[
        ["https://www.youtube.com/watch?v=dQw4w9WgXcQ", "Analyze this track", None, 75, True],
        [None, "Describe a music track", None, 30, False],
        ["https://www.youtube.com/watch?v=9bZkp7q19f0", "Extract audio features", None, 60, True],
    ],
)

# Start the web server only when the file is executed as a script.
if __name__ == "__main__":
    iface.launch()