# NOTE: removed non-Python extraction artifacts that preceded this file
# (a byte-count line, a row of revision hashes, and a line-number gutter).
import gradio as gr
from pathlib import Path
import yt_dlp
import logging
import librosa
import numpy as np
from PIL import Image
import ffmpeg
import shutil
import tempfile
import time
# Set up logging for debugging.
# NOTE: configuring the root logger at DEBUG also makes yt-dlp/librosa/gradio
# internals log verbosely — lower to INFO for quieter production runs.
logging.basicConfig(level=logging.DEBUG)
def analyze_audio(youtube_url, input_text, input_image=None, slider_value=50, checkbox_value=False):
    """
    Download YouTube audio, run automatic librosa feature analysis, and process inputs.

    Handles all file/folder management in a per-call temporary directory.

    Args:
        youtube_url (str): YouTube video URL (optional; falsy value skips download).
        input_text (str): Text input echoed into the result.
        input_image (PIL.Image.Image, optional): Image to brighten, if provided.
        slider_value (float): Numerical parameter (e.g., analysis threshold), 0-100.
        checkbox_value (bool): Toggle for enhanced analysis.

    Returns:
        tuple: (processed_text, output_image_display, output_audio, extra_info)
    """
    # Unique scratch area per call so concurrent requests never collide.
    temp_dir = Path(tempfile.mkdtemp(prefix="audio_analysis_"))
    output_dir = temp_dir / "downloaded_media"
    output_dir.mkdir(parents=True, exist_ok=True)
    logging.debug(f"Created temporary directory: {temp_dir}, output directory: {output_dir}")
    try:
        # Initialize outputs.
        processed_text = f"Processed: '{input_text}'."
        output_image_display = input_image
        output_audio = None
        extra_info = f"Threshold: {slider_value/100:.2f}"
        # Handle YouTube download if a URL is provided.
        if youtube_url:
            try:
                # Validate YouTube URL before handing it to yt-dlp.
                if not youtube_url.startswith(("https://www.youtube.com/", "https://youtu.be/")):
                    return "Error: Invalid YouTube URL", None, None, "Processing failed."
                # YouTube download options (audio only, extracted to mp3).
                ydl_opts = {
                    'format': 'bestaudio/best',
                    'outtmpl': str(output_dir / '%(title)s.%(ext)s'),
                    'postprocessors': [{
                        'key': 'FFmpegExtractAudio',
                        'preferredcodec': 'mp3',
                        'preferredquality': '192',
                    }],
                    'restrictfilenames': True,
                    'noplaylist': True,
                }
                with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                    info = ydl.extract_info(youtube_url, download=True)
                    # FIX: with restrictfilenames=True the on-disk name is
                    # sanitized, so building the path from the raw title could
                    # point at a file that does not exist. prepare_filename()
                    # yields the real pre-postprocessing path; swap in the
                    # .mp3 suffix the FFmpegExtractAudio step produces.
                    audio_file = Path(ydl.prepare_filename(info)).with_suffix('.mp3')
                logging.debug(f"Downloaded audio: {audio_file}")
                # FIX: the finally-block removes temp_dir, which previously
                # deleted the file Gradio was about to serve (time.sleep(1)
                # was only a fragile workaround). Copy the audio to a
                # persistent temp path that survives cleanup instead.
                persistent_audio = Path(tempfile.gettempdir()) / f"gradio_audio_{temp_dir.name}_{audio_file.name}"
                shutil.copy2(audio_file, persistent_audio)
                output_audio = str(persistent_audio)
                # Automatic audio feature analysis with librosa.
                y, sr = librosa.load(audio_file)
                hop_length = 512  # librosa's default frame hop; 0 would raise "Invalid hop_length"
                logging.debug(f"Using hop_length: {hop_length}")
                mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13, hop_length=hop_length)
                spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr, hop_length=hop_length)
                tempo, _ = librosa.beat.beat_track(y=y, sr=sr, hop_length=hop_length)
                # FIX: librosa >= 0.10 returns tempo as a 1-element ndarray,
                # which breaks the ':.2f' format spec; coerce to a scalar.
                tempo = float(np.atleast_1d(tempo)[0])
                # Aggregate features into a human-readable summary.
                mfcc_mean = np.mean(mfcc, axis=1).tolist()[:3]  # mean of first 3 MFCC coefficients
                spectral_centroid_mean = np.mean(spectral_centroid)
                features_summary = (
                    f"Audio Features: MFCC (mean of first 3 coeffs): {mfcc_mean}, "
                    f"Spectral Centroid: {spectral_centroid_mean:.2f} Hz, "
                    f"Tempo: {tempo:.2f} BPM"
                )
                processed_text += f" {features_summary}."
                extra_info += f", Audio: {audio_file.name}"
            except Exception as e:
                # Best-effort: surface the failure in the UI text instead of crashing.
                logging.error(f"YouTube download or audio processing error: {str(e)}")
                processed_text += f" Error processing YouTube audio: {str(e)}."
        # Handle image processing if provided.
        if input_image is not None:
            from PIL import ImageEnhance
            # Brighten by 50% as a simple demonstration transform.
            output_image_display = ImageEnhance.Brightness(input_image).enhance(1.5)
            processed_text += " Image processed (brightened)."
        else:
            processed_text += " No image provided."
        # Incorporate slider and checkbox into the textual result.
        processed_text += f" Slider: {slider_value}, Enhanced Analysis: {checkbox_value}."
        if checkbox_value:
            processed_text += " Enhanced analysis enabled."
            if youtube_url and slider_value > 50:
                processed_text += f" High threshold ({slider_value}) applied for deeper analysis."
        return processed_text, output_image_display, output_audio, extra_info
    except Exception as e:
        logging.error(f"Error in analyze_audio: {str(e)}")
        return f"Error: {str(e)}", None, None, "Processing failed."
    finally:
        # Clean up the scratch directory immediately; the served audio file
        # lives outside temp_dir now, so no sleep/delay is needed.
        try:
            if temp_dir.exists():
                shutil.rmtree(temp_dir)
                logging.debug(f"Cleaned up temporary directory: {temp_dir}")
        except Exception as e:
            logging.error(f"Error cleaning up temporary directory: {str(e)}")
# ---------------------------------------------------------------------------
# Input widgets for the Gradio interface (order matches analyze_audio's args).
# ---------------------------------------------------------------------------
input_youtube_url = gr.Textbox(
    placeholder="e.g., https://www.youtube.com/watch?v=dQw4w9WgXcQ",
    info="Optional: Enter a YouTube URL to download and analyze audio.",
    label="YouTube Video URL",
)
input_text_component = gr.Textbox(
    placeholder="e.g., Analyze this audio track",
    info="Type a description or query for processing.",
    label="Input Text",
)
# NOTE(review): `info=` here assumes a Gradio version where gr.Image accepts
# it — confirm against the pinned gradio release.
input_image_component = gr.Image(
    sources=["upload", "webcam", "clipboard"],
    label="Upload Image (Optional)",
    type="pil",
)
input_slider_component = gr.Slider(
    minimum=0,
    maximum=100,
    step=1,
    value=50,
    info="Adjusts sensitivity of audio feature analysis.",
    label="Analysis Threshold",
)
input_checkbox_component = gr.Checkbox(
    info="Toggle for deeper audio feature extraction.",
    label="Enable Enhanced Analysis",
)
# ---------------------------------------------------------------------------
# Output widgets (order matches analyze_audio's return tuple).
# ---------------------------------------------------------------------------
output_text_component = gr.Textbox(
    info="Text results including audio feature analysis.",
    label="Analysis Results",
)
output_image_component = gr.Image(
    info="Processed image output (if provided).",
    label="Processed Image (if any)",
)
output_audio_component = gr.Audio(
    info="Audio downloaded from YouTube.",
    type="filepath",  # analyze_audio returns a path string, not raw samples
    label="Downloaded Audio",
)
output_label_component = gr.Label(
    info="Feature analysis details and processing info.",
    label="Analysis Summary",
)
# Wire the handler and widgets into a single Gradio Interface.
# (fn, inputs, outputs are gr.Interface's first three positional parameters.)
iface = gr.Interface(
    analyze_audio,
    [
        input_youtube_url,
        input_text_component,
        input_image_component,
        input_slider_component,
        input_checkbox_component,
    ],
    [
        output_text_component,
        output_image_component,
        output_audio_component,
        output_label_component,
    ],
    examples=[
        ["https://www.youtube.com/watch?v=dQw4w9WgXcQ", "Analyze this track", None, 75, True],
        [None, "Describe a music track", None, 30, False],
        ["https://www.youtube.com/watch?v=9bZkp7q19f0", "Extract audio features", None, 60, True],
    ],
    title="YouTube Audio Feature Analysis",
    description="Download YouTube audio, analyze features with librosa, and process text/image inputs. Customize with slider and checkbox.",
    theme=gr.themes.Soft(),
    allow_flagging="never",
)
if __name__ == "__main__":
    # Launch the Gradio app (blocks until the server is stopped).
    # FIX: removed a stray trailing '|' extraction artifact after launch().
    iface.launch()