# Spaces: latterworks / A (Sleeping)
# File size: 7,842 Bytes

import gradio as gr
from pathlib import Path
import yt_dlp
import logging
import librosa
import numpy as np
from PIL import Image
import ffmpeg
import shutil
import tempfile
import time

# Set up logging for debugging: configure the root logger at DEBUG level so the
# logging.debug(...) calls in this module are emitted.
logging.basicConfig(level=logging.DEBUG)

# URL prefixes accepted as YouTube links; anything else is rejected before
# yt-dlp is invoked.
_YOUTUBE_URL_PREFIXES = (
    "https://www.youtube.com/",
    "https://youtube.com/",
    "https://m.youtube.com/",
    "https://music.youtube.com/",
    "https://youtu.be/",
)


def _download_audio(youtube_url, output_dir):
    """Download the audio track of ``youtube_url`` into ``output_dir`` as MP3.

    Args:
        youtube_url (str): Already-validated YouTube URL.
        output_dir (Path): Existing directory that receives the download.

    Returns:
        Path: Location of the extracted ``.mp3`` file.

    Raises:
        FileNotFoundError: If no MP3 file exists once yt-dlp has finished.
    """
    ydl_opts = {
        'format': 'bestaudio/best',
        'outtmpl': str(output_dir / '%(title)s.%(ext)s'),
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
        'restrictfilenames': True,
        'noplaylist': True,
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(youtube_url, download=True)
        # 'restrictfilenames' rewrites the title on disk, so ask yt-dlp for the
        # name it actually used instead of rebuilding it from info['title'].
        expected = Path(ydl.prepare_filename(info)).with_suffix(".mp3")

    if expected.exists():
        return expected

    # Fall back to whatever MP3 the post-processor left behind; output_dir is
    # private to this run, so any MP3 in it belongs to this download.
    candidates = sorted(output_dir.glob("*.mp3"))
    if not candidates:
        raise FileNotFoundError(f"No MP3 file was produced in {output_dir}")
    return candidates[0]


def _summarize_audio_features(audio_file):
    """Load ``audio_file`` with librosa and return a one-line feature summary."""
    y, sr = librosa.load(audio_file)
    hop_length = 512  # Valid hop_length to fix "Invalid hop_length: 0" error
    logging.debug(f"Using hop_length: {hop_length}")

    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13, hop_length=hop_length)
    spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr, hop_length=hop_length)
    tempo, _ = librosa.beat.beat_track(y=y, sr=sr, hop_length=hop_length)

    # beat_track may hand back the tempo as a one-element array rather than a
    # plain float; normalise it so the ':.2f' format below cannot fail.
    tempo_bpm = float(np.atleast_1d(tempo)[0])

    mfcc_mean = np.mean(mfcc, axis=1).tolist()[:3]  # Means of the first 3 MFCC coefficients
    spectral_centroid_mean = np.mean(spectral_centroid)
    return (
        f"Audio Features: MFCC (mean of first 3 coeffs): {mfcc_mean}, "
        f"Spectral Centroid: {spectral_centroid_mean:.2f} Hz, "
        f"Tempo: {tempo_bpm:.2f} BPM"
    )


def analyze_audio(youtube_url, input_text, input_image=None, slider_value=50, checkbox_value=False):
    """
    Downloads YouTube audio, summarizes audio features with librosa, and processes the other inputs.

    Intermediate download files live in a per-call temporary directory that is
    removed before returning. A successfully downloaded MP3 is first moved to a
    separate temporary directory that is NOT removed, so the returned path
    still exists when the caller reads it.

    Args:
        youtube_url (str): YouTube video URL (optional). Surrounding whitespace
            is ignored; an empty value skips the download entirely.
        input_text (str): Text input echoed into the result text.
        input_image (PIL.Image, optional): Image to brighten by a factor of 1.5.
        slider_value (float): Numerical parameter, reported as a 0-1 threshold.
        checkbox_value (bool): Toggle that adds the "enhanced analysis" notes.

    Returns:
        tuple: (processed_text, output_image_display, output_audio, extra_info).
        On an invalid URL or an unexpected failure the tuple is
        (error message, None, None, "Processing failed."). A download or
        analysis failure is reported inside processed_text instead.
    """
    # Create a unique temporary working directory for this run
    work_dir = Path(tempfile.mkdtemp(prefix="audio_analysis_"))
    download_dir = work_dir / "downloaded_media"
    download_dir.mkdir(parents=True, exist_ok=True)
    logging.debug(f"Created temporary directory: {work_dir}, output directory: {download_dir}")

    try:
        # Initialize outputs
        processed_text = f"Processed: '{input_text}'."
        output_image_display = input_image
        output_audio = None
        extra_info = f"Threshold: {slider_value/100:.2f}"

        # A pasted URL often carries stray whitespace; a blank one means "no URL".
        if isinstance(youtube_url, str):
            youtube_url = youtube_url.strip()

        # Handle YouTube download if URL is provided
        if youtube_url:
            try:
                if not youtube_url.startswith(_YOUTUBE_URL_PREFIXES):
                    return "Error: Invalid YouTube URL", None, None, "Processing failed."

                audio_file = _download_audio(youtube_url, download_dir)
                logging.debug(f"Downloaded audio: {audio_file}")

                # Move the MP3 out of the working directory: that directory is
                # deleted in `finally`, which runs before the caller ever sees
                # the returned path. The kept copy is left for the OS temp
                # cleaner (presumably Gradio copies it into its own cache).
                keep_dir = Path(tempfile.mkdtemp(prefix="audio_output_"))
                audio_file = Path(shutil.move(str(audio_file), str(keep_dir / audio_file.name)))
                output_audio = str(audio_file)

                features_summary = _summarize_audio_features(audio_file)
                processed_text += f" {features_summary}."
                extra_info += f", Audio: {audio_file.name}"

            except Exception as e:
                # Best effort: report the failure in the text output and carry
                # on with the image/slider/checkbox processing.
                logging.exception(f"YouTube download or audio processing error: {str(e)}")
                processed_text += f" Error processing YouTube audio: {str(e)}."

        # Handle image processing if provided
        if input_image is not None:
            from PIL import ImageEnhance
            enhancer = ImageEnhance.Brightness(input_image)
            output_image_display = enhancer.enhance(1.5)
            processed_text += " Image processed (brightened)."
        else:
            processed_text += " No image provided."

        # Incorporate slider and checkbox
        processed_text += f" Slider: {slider_value}, Enhanced Analysis: {checkbox_value}."
        if checkbox_value:
            processed_text += " Enhanced analysis enabled."
            if youtube_url and slider_value > 50:
                processed_text += f" High threshold ({slider_value}) applied for deeper analysis."

        return processed_text, output_image_display, output_audio, extra_info

    except Exception as e:
        logging.exception(f"Error in analyze_audio: {str(e)}")
        return f"Error: {str(e)}", None, None, "Processing failed."

    finally:
        # Remove only the working directory; a returned audio file was moved
        # elsewhere above, so no delay is needed before deleting.
        try:
            if work_dir.exists():
                shutil.rmtree(work_dir)
                logging.debug(f"Cleaned up temporary directory: {work_dir}")
        except Exception as e:
            logging.error(f"Error cleaning up temporary directory: {str(e)}")

# Input widgets. Their order in the Interface `inputs` list determines which
# analyze_audio parameter each value is bound to.

# Free-text field for the (optional) YouTube link.
input_youtube_url = gr.Textbox(
    info="Optional: Enter a YouTube URL to download and analyze audio.",
    placeholder="e.g., https://www.youtube.com/watch?v=dQw4w9WgXcQ",
    label="YouTube Video URL",
)

# Free-text description/query that is echoed into the result text.
input_text_component = gr.Textbox(
    info="Type a description or query for processing.",
    placeholder="e.g., Analyze this audio track",
    label="Input Text",
)

# Optional image, delivered to the callback as a PIL image.
input_image_component = gr.Image(
    sources=["upload", "webcam", "clipboard"],
    label="Upload Image (Optional)",
    type="pil",
)

# Integer-stepped 0-100 slider, defaulting to the midpoint.
input_slider_component = gr.Slider(
    label="Analysis Threshold",
    info="Adjusts sensitivity of audio feature analysis.",
    value=50,
    step=1,
    minimum=0,
    maximum=100,
)

# Boolean toggle for the "enhanced analysis" notes.
input_checkbox_component = gr.Checkbox(
    info="Toggle for deeper audio feature extraction.",
    label="Enable Enhanced Analysis",
)

# Define output components
output_text_component = gr.Textbox(
    label="Analysis Results",
    info="Text results including audio feature analysis."
)
# NOTE: unlike the form components (Textbox, Slider, Checkbox), gr.Image,
# gr.Audio and gr.Label take no `info` keyword in Gradio 4.x, so passing one
# raises TypeError when the module is imported. The former help texts are kept
# as comments instead.

# Processed image output (if provided).
output_image_component = gr.Image(
    label="Processed Image (if any)"
)
# Audio downloaded from YouTube.
output_audio_component = gr.Audio(
    label="Downloaded Audio",
    type="filepath"
)
# Feature analysis details and processing info.
output_label_component = gr.Label(
    label="Analysis Summary"
)

# Create the Gradio interface: wires the five input components to
# analyze_audio (in parameter order) and its four return values to the outputs.
iface = gr.Interface(
    fn=analyze_audio,
    inputs=[
        input_youtube_url,
        input_text_component,
        input_image_component,
        input_slider_component,
        input_checkbox_component
    ],
    outputs=[
        output_text_component,
        output_image_component,
        output_audio_component,
        output_label_component
    ],
    title="YouTube Audio Feature Analysis",
    description="Download YouTube audio, analyze features with librosa, and process text/image inputs. Customize with slider and checkbox.",
    examples=[
        ["https://www.youtube.com/watch?v=dQw4w9WgXcQ", "Analyze this track", None, 75, True],
        [None, "Describe a music track", None, 30, False],
        ["https://www.youtube.com/watch?v=9bZkp7q19f0", "Extract audio features", None, 60, True]
    ],
    # Example caching defaults to on when hosted on Hugging Face Spaces, which
    # would run analyze_audio on every example (two YouTube downloads) at
    # start-up. Keep the examples as click-to-fill presets instead.
    cache_examples=False,
    allow_flagging="never",
    theme=gr.themes.Soft()
)

# Launch the interface only when this file is run as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    iface.launch()