File size: 6,472 Bytes
80b6d17
 
 
 
 
 
00a70f8
80b6d17
 
31b1379
ce3218c
 
 
 
31b1379
 
 
 
ce3218c
31b1379
 
 
 
 
 
ce3218c
80b6d17
 
 
 
 
 
 
 
 
 
 
8a709b9
 
 
80b6d17
 
8a709b9
80b6d17
 
 
 
 
 
 
 
 
 
 
 
 
 
8a709b9
80b6d17
 
 
 
 
 
 
 
8a709b9
31b1379
 
e33689b
8a709b9
 
80b6d17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a0a6f55
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
import gradio as gr
import librosa
import numpy as np
import requests
from gradio.outputs import Video

from video_generator import generate_video

def extract_lyrics(api_response):
    """Split a Deepgram transcript into ~10-second lyric chunks.

    Parameters
    ----------
    api_response : dict
        Raw Deepgram JSON; words are read from
        ``results.channels[0].alternatives[0].words`` where each entry has
        ``word``, ``start`` and ``end`` keys (times in seconds).

    Returns
    -------
    list[tuple[float, float, str]]
        ``(chunk_start, chunk_end, text)`` triples. ``chunk_end`` is the end
        time of the LAST word inside the chunk (the original implementation
        wrongly used the end of the first word of the *next* chunk, making
        chunks overlap).
    """
    words_timing = api_response["results"]["channels"][0]["alternatives"][0]["words"]
    # Empty transcript (e.g. instrumental track): no chunks, and avoids the
    # IndexError the original raised on words_timing[-1].
    if not words_timing:
        return []
    CHUNK_DURATION = 10  # seconds per lyric chunk
    lyrics_with_timing = []
    current_chunk_words = []
    current_chunk_start_time = 0
    last_end = 0
    for word_info in words_timing:
        word = word_info["word"]
        start_time = word_info["start"]
        if start_time >= current_chunk_start_time + CHUNK_DURATION and current_chunk_words:
            # Close the current chunk at the end of its own last word.
            lyrics_with_timing.append((current_chunk_start_time, last_end, " ".join(current_chunk_words)))
            current_chunk_words = []
        # Advance the window far enough to contain this word. The original
        # advanced by exactly one CHUNK_DURATION per boundary, so a silence
        # gap longer than 10 s made every later boundary drift.
        while start_time >= current_chunk_start_time + CHUNK_DURATION:
            current_chunk_start_time += CHUNK_DURATION
        current_chunk_words.append(word)
        last_end = word_info["end"]
    # Flush the final (possibly partial) chunk.
    lyrics_with_timing.append((current_chunk_start_time, last_end, " ".join(current_chunk_words)))
    return lyrics_with_timing


def send_to_deepgram(audio_file_path):
    """Transcribe an audio file with the Deepgram speech-to-text API.

    Parameters
    ----------
    audio_file_path : str
        Path of the audio file whose raw bytes are POSTed to Deepgram.

    Returns
    -------
    dict
        The parsed JSON response from the Deepgram ``/v1/listen`` endpoint.

    Raises
    ------
    RuntimeError
        If the ``DEEPGRAM_API_KEY`` environment variable is not set.
    requests.HTTPError
        If Deepgram returns an error status code.
    """
    import os  # local import keeps the module-level import block untouched

    # SECURITY: a live API key was previously hard-coded on this line and is
    # therefore leaked in version control — revoke it. The key must now be
    # supplied via the environment instead.
    api_key = os.environ.get("DEEPGRAM_API_KEY")
    if not api_key:
        raise RuntimeError("Set the DEEPGRAM_API_KEY environment variable")

    endpoint = "https://api.deepgram.com/v1/listen"
    headers = {"Authorization": f"Token {api_key}"}
    with open(audio_file_path, 'rb') as audio_file:
        audio_data = audio_file.read()

    response = requests.post(endpoint, headers=headers, data=audio_data)
    # Fail loudly on an HTTP error; otherwise an error body would flow into
    # extract_lyrics and surface as an unrelated KeyError.
    response.raise_for_status()
    response_json = response.json()
    print("Deepgram API Response:", response_json)  # Log the response
    return response_json

def analyze_audio(audio_file_path):
    """Analyze an audio file in 10-second chunks, yielding one result per chunk.

    For each chunk the audio features and mood are computed once, a
    text-to-video prompt is built (aware of the neighbouring chunks' moods
    for smooth transitions), and a video is generated from the transcript.

    Parameters
    ----------
    audio_file_path : str
        Path to the audio file to analyze.

    Yields
    ------
    tuple
        ``(description, video)`` — an HTML description of the chunk and the
        video produced by ``generate_video``.
    """
    print("Analyzing audio...")  # Log start of analysis
    last_frame = None
    y, sr = librosa.load(audio_file_path)
    chunk_length = 10 * sr  # 10 seconds worth of samples

    deepgram_response = send_to_deepgram(audio_file_path)
    # Loop-invariant: computed once here (the original re-ran extract_lyrics
    # on every iteration of the main loop below).
    lyrics_chunks = extract_lyrics(deepgram_response)

    # Analyze every chunk exactly once. The original ran analyze_chunk twice
    # per chunk: once in a mood pre-pass and again inside the main loop.
    chunk_starts = range(0, len(y), chunk_length)
    analyses = [analyze_chunk(y[start:start + chunk_length]) for start in chunk_starts]
    moods = [analysis[0] for analysis in analyses]

    for i, start in enumerate(chunk_starts):
        print(f"Analyzing chunk {i + 1}...")  # Log chunk analysis
        lyrics_summary = lyrics_chunks[i] if i < len(lyrics_chunks) else 'Instrumental or silence'
        previous_mood = moods[i - 1] if i > 0 else None
        current_mood = moods[i]
        next_mood = moods[i + 1] if i < len(moods) - 1 else None
        _, tempo, chroma_mean, spectral_contrast_mean, zero_crossing_rate_mean, mfcc_mean = analyses[i]
        prompt = generate_video_prompt(previous_mood, current_mood, next_mood, tempo, chroma_mean, spectral_contrast_mean, zero_crossing_rate_mean, mfcc_mean, lyrics_summary)
        description = f"Chunk starting at {start / sr} seconds:<br>Mood: {current_mood}<br>Video Prompt: {prompt}<br><br>"
        print(f"Generating video for chunk {i + 1}...")
        video = generate_video(lyrics_chunks, last_frame)
        #last_frame = extract_last_frame(video)
        print(f"Description for chunk {i + 1}: {description}")
        print(f"Video for chunk {i + 1}: {video}")

        # Yield the result for this chunk
        yield (description, video)
  
def analyze_chunk(chunk):
    """Compute the mood label and raw audio features for one chunk of samples.

    Parameters
    ----------
    chunk : np.ndarray
        Audio samples for one ~10-second window.

    Returns
    -------
    tuple
        ``(mood, tempo, chroma_mean, spectral_contrast_mean,
        zero_crossing_rate_mean, mfcc_mean)``.
    """
    tempo, _ = librosa.beat.beat_track(y=chunk)
    # Collapse each feature matrix to a single scalar summary.
    features = (
        np.mean(librosa.feature.chroma_stft(y=chunk)),
        np.mean(librosa.feature.spectral_contrast(y=chunk)),
        np.mean(librosa.feature.zero_crossing_rate(chunk)),
        np.mean(librosa.feature.mfcc(y=chunk)),
    )
    mood = analyze_mood(tempo, *features)
    return (mood, tempo) + features

def analyze_mood(tempo, chroma_mean, spectral_contrast_mean, zero_crossing_rate_mean, mfcc_mean):
    """Map scalar audio features onto a coarse mood label.

    Rules are evaluated top-down; the first match wins, falling back to
    ``'Neutral'``. Returns one of: Happy, Sad, Aggressive, Energetic,
    Relaxed, Romantic, Nostalgic, Tense, Dreamy, Neutral.
    """
    # Happy Mood
    if tempo > 110 and chroma_mean > 0.4:
        return 'Happy'
    # Sad Mood
    elif tempo < 90 and chroma_mean < 0.5 and mfcc_mean < 0:
        return 'Sad'
    # Aggressive Mood — checked BEFORE Energetic: its thresholds
    # (tempo > 140, zcr > 0.08) are strictly contained in Energetic's
    # (tempo > 130, zcr > 0.05), so in the original ordering this branch
    # was unreachable dead code.
    elif tempo > 140 and zero_crossing_rate_mean > 0.08:
        return 'Aggressive'
    # Energetic Mood
    elif tempo > 130 and zero_crossing_rate_mean > 0.05:
        return 'Energetic'
    # Relaxed Mood
    elif tempo < 100 and chroma_mean > 0.3 and spectral_contrast_mean > 15:
        return 'Relaxed'
    # Romantic Mood
    elif tempo < 100 and chroma_mean > 0.5:
        return 'Romantic'
    # Nostalgic Mood
    elif tempo < 100 and chroma_mean < 0.5 and spectral_contrast_mean < 25:
        return 'Nostalgic'
    # Tense Mood
    elif 100 <= tempo <= 130 and chroma_mean < 0.5 and spectral_contrast_mean > 20:
        return 'Tense'
    # Dreamy Mood
    elif tempo < 80 and chroma_mean > 0.4:
        return 'Dreamy'
    # Neutral Mood (Catch-all)
    else:
        return 'Neutral'

def describe_tempo(tempo):
    """Return a short English qualifier (with article) for a BPM value."""
    # Threshold table, ordered ascending; the first bucket the tempo falls
    # under wins, anything at or above 150 BPM is "a fast".
    buckets = (
        (60, "a very slow"),
        (90, "a slow"),
        (120, "a moderate"),
        (150, "a lively"),
    )
    for upper_bound, label in buckets:
        if tempo < upper_bound:
            return label
    return "a fast"

def generate_video_prompt(previous_mood, current_mood, next_mood, tempo, chroma_mean, spectral_contrast_mean, zero_crossing_rate_mean, mfcc_mean, lyrics_summary):
    """Build the text-to-video prompt for one audio chunk.

    Combines the chunk's mood (plus the neighbouring chunks' moods as
    transition hints), coarse audio-feature descriptors, and a lyrics
    summary into a single prompt string.
    """
    # Translate raw feature values into short visual descriptors.
    if zero_crossing_rate_mean > 0.05:
        rhythm_description = "energetic rhythm"
    else:
        rhythm_description = "smooth rhythm"
    tonal_quality = "mellow tones" if chroma_mean <= 0.5 else "bright tones"
    spectral_description = "soft contrasts" if spectral_contrast_mean <= 20 else "sharp contrasts"
    tempo_description = describe_tempo(tempo)

    # Mention neighbouring moods so consecutive clips blend together.
    transition_parts = []
    if previous_mood:
        transition_parts.append(f"Transition from a {previous_mood.lower()} mood. ")
    if next_mood:
        transition_parts.append(f"Prepare to transition to a {next_mood.lower()} mood. ")
    transition_description = "".join(transition_parts)

    return (
        f"Essence of a {current_mood.lower()} mood. "
        + transition_description
        + f"Showcase a scene with {rhythm_description}, {tonal_quality}, and {spectral_description}. "
        + f"Visualize {tempo_description} tempo. "  # Updated line
        + f"Narrative based on the lyrics: '{lyrics_summary}'. "
        + "Emphasize the themes and emotions conveyed in the song."
    )

# Define and launch the Gradio interface. analyze_audio is a generator, so
# Gradio streams one (description, video) pair per 10-second chunk.
# NOTE(review): the legacy `gradio.outputs.Video` class (imported at the top
# of this file) was removed in Gradio 3.x; the modern `gr.Video` component
# is used here instead — the stale import above should be removed.
gr.Interface(
    fn=analyze_audio,
    inputs=gr.Audio(type="filepath"),
    outputs=[gr.HTML(), gr.Video()],
).launch()