File size: 6,472 Bytes
80b6d17
 
 
 
 
 
00a70f8
80b6d17
 
31b1379
ce3218c
 
 
 
31b1379
 
 
 
ce3218c
31b1379
 
 
 
 
 
ce3218c
80b6d17
 
 
 
 
 
 
 
 
 
 
8a709b9
 
 
80b6d17
 
8a709b9
80b6d17
 
 
 
 
 
 
 
 
 
 
 
 
 
8a709b9
80b6d17
 
 
 
 
 
 
 
8a709b9
31b1379
 
e33689b
8a709b9
 
80b6d17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a0a6f55
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
import gradio as gr
import librosa
import numpy as np
import requests
from gradio.outputs import Video

from video_generator import generate_video

def extract_lyrics(api_response):
    """Split a Deepgram transcript into ~10-second lyric chunks.

    Parameters
    ----------
    api_response : dict
        Raw Deepgram JSON; words are read from
        ``results.channels[0].alternatives[0].words`` where each entry has
        ``word``, ``start`` and ``end`` keys (times in seconds).

    Returns
    -------
    list[tuple[float, float, str]]
        ``(chunk_start, chunk_end, text)`` triples. ``chunk_end`` is the end
        time of the LAST word inside the chunk (the original implementation
        wrongly used the end of the first word of the *next* chunk, making
        chunks overlap).
    """
    words_timing = api_response["results"]["channels"][0]["alternatives"][0]["words"]
    # Empty transcript (e.g. instrumental track): no chunks, and avoids the
    # IndexError the original raised on words_timing[-1].
    if not words_timing:
        return []
    CHUNK_DURATION = 10  # seconds per lyric chunk
    lyrics_with_timing = []
    current_chunk_words = []
    current_chunk_start_time = 0
    last_end = 0
    for word_info in words_timing:
        word = word_info["word"]
        start_time = word_info["start"]
        if start_time >= current_chunk_start_time + CHUNK_DURATION and current_chunk_words:
            # Close the current chunk at the end of its own last word.
            lyrics_with_timing.append((current_chunk_start_time, last_end, " ".join(current_chunk_words)))
            current_chunk_words = []
        # Advance the window far enough to contain this word. The original
        # advanced by exactly one CHUNK_DURATION per boundary, so a silence
        # gap longer than 10 s made every later boundary drift.
        while start_time >= current_chunk_start_time + CHUNK_DURATION:
            current_chunk_start_time += CHUNK_DURATION
        current_chunk_words.append(word)
        last_end = word_info["end"]
    # Flush the final (possibly partial) chunk.
    lyrics_with_timing.append((current_chunk_start_time, last_end, " ".join(current_chunk_words)))
    return lyrics_with_timing


def send_to_deepgram(audio_file_path):
    """Transcribe an audio file with the Deepgram speech-to-text API.

    Parameters
    ----------
    audio_file_path : str
        Path of the audio file whose raw bytes are POSTed to Deepgram.

    Returns
    -------
    dict
        The parsed JSON response from the Deepgram ``/v1/listen`` endpoint.

    Raises
    ------
    RuntimeError
        If the ``DEEPGRAM_API_KEY`` environment variable is not set.
    requests.HTTPError
        If Deepgram returns an error status code.
    """
    import os  # local import keeps the module-level import block untouched

    # SECURITY: a live API key was previously hard-coded on this line and is
    # therefore leaked in version control — revoke it. The key must now be
    # supplied via the environment instead.
    api_key = os.environ.get("DEEPGRAM_API_KEY")
    if not api_key:
        raise RuntimeError("Set the DEEPGRAM_API_KEY environment variable")

    endpoint = "https://api.deepgram.com/v1/listen"
    headers = {"Authorization": f"Token {api_key}"}
    with open(audio_file_path, 'rb') as audio_file:
        audio_data = audio_file.read()

    response = requests.post(endpoint, headers=headers, data=audio_data)
    # Fail loudly on an HTTP error; otherwise an error body would flow into
    # extract_lyrics and surface as an unrelated KeyError.
    response.raise_for_status()
    response_json = response.json()
    print("Deepgram API Response:", response_json)  # Log the response
    return response_json

def analyze_audio(audio_file_path):
    """Analyze an audio file in 10-second chunks, yielding one result per chunk.

    For each chunk the audio features and mood are computed once, a
    text-to-video prompt is built (aware of the neighbouring chunks' moods
    for smooth transitions), and a video is generated from the transcript.

    Parameters
    ----------
    audio_file_path : str
        Path to the audio file to analyze.

    Yields
    ------
    tuple
        ``(description, video)`` — an HTML description of the chunk and the
        video produced by ``generate_video``.
    """
    print("Analyzing audio...")  # Log start of analysis
    last_frame = None
    y, sr = librosa.load(audio_file_path)
    chunk_length = 10 * sr  # 10 seconds worth of samples

    deepgram_response = send_to_deepgram(audio_file_path)
    # Loop-invariant: computed once here (the original re-ran extract_lyrics
    # on every iteration of the main loop below).
    lyrics_chunks = extract_lyrics(deepgram_response)

    # Analyze every chunk exactly once. The original ran analyze_chunk twice
    # per chunk: once in a mood pre-pass and again inside the main loop.
    chunk_starts = range(0, len(y), chunk_length)
    analyses = [analyze_chunk(y[start:start + chunk_length]) for start in chunk_starts]
    moods = [analysis[0] for analysis in analyses]

    for i, start in enumerate(chunk_starts):
        print(f"Analyzing chunk {i + 1}...")  # Log chunk analysis
        lyrics_summary = lyrics_chunks[i] if i < len(lyrics_chunks) else 'Instrumental or silence'
        previous_mood = moods[i - 1] if i > 0 else None
        current_mood = moods[i]
        next_mood = moods[i + 1] if i < len(moods) - 1 else None
        _, tempo, chroma_mean, spectral_contrast_mean, zero_crossing_rate_mean, mfcc_mean = analyses[i]
        prompt = generate_video_prompt(previous_mood, current_mood, next_mood, tempo, chroma_mean, spectral_contrast_mean, zero_crossing_rate_mean, mfcc_mean, lyrics_summary)
        description = f"Chunk starting at {start / sr} seconds:<br>Mood: {current_mood}<br>Video Prompt: {prompt}<br><br>"
        print(f"Generating video for chunk {i + 1}...")
        video = generate_video(lyrics_chunks, last_frame)
        #last_frame = extract_last_frame(video)
        print(f"Description for chunk {i + 1}: {description}")
        print(f"Video for chunk {i + 1}: {video}")

        # Yield the result for this chunk
        yield (description, video)
  
def analyze_chunk(chunk):
    """Compute the mood label and raw audio features for one chunk of samples.

    Parameters
    ----------
    chunk : np.ndarray
        Audio samples for one ~10-second window.

    Returns
    -------
    tuple
        ``(mood, tempo, chroma_mean, spectral_contrast_mean,
        zero_crossing_rate_mean, mfcc_mean)``.
    """
    tempo, _ = librosa.beat.beat_track(y=chunk)
    # Collapse each feature matrix to a single scalar summary.
    features = (
        np.mean(librosa.feature.chroma_stft(y=chunk)),
        np.mean(librosa.feature.spectral_contrast(y=chunk)),
        np.mean(librosa.feature.zero_crossing_rate(chunk)),
        np.mean(librosa.feature.mfcc(y=chunk)),
    )
    mood = analyze_mood(tempo, *features)
    return (mood, tempo) + features

def analyze_mood(tempo, chroma_mean, spectral_contrast_mean, zero_crossing_rate_mean, mfcc_mean):
    """Map scalar audio features onto a coarse mood label.

    Rules are evaluated top-down; the first match wins, falling back to
    ``'Neutral'``. Returns one of: Happy, Sad, Aggressive, Energetic,
    Relaxed, Romantic, Nostalgic, Tense, Dreamy, Neutral.
    """
    # Happy Mood
    if tempo > 110 and chroma_mean > 0.4:
        return 'Happy'
    # Sad Mood
    elif tempo < 90 and chroma_mean < 0.5 and mfcc_mean < 0:
        return 'Sad'
    # Aggressive Mood — checked BEFORE Energetic: its thresholds
    # (tempo > 140, zcr > 0.08) are strictly contained in Energetic's
    # (tempo > 130, zcr > 0.05), so in the original ordering this branch
    # was unreachable dead code.
    elif tempo > 140 and zero_crossing_rate_mean > 0.08:
        return 'Aggressive'
    # Energetic Mood
    elif tempo > 130 and zero_crossing_rate_mean > 0.05:
        return 'Energetic'
    # Relaxed Mood
    elif tempo < 100 and chroma_mean > 0.3 and spectral_contrast_mean > 15:
        return 'Relaxed'
    # Romantic Mood
    elif tempo < 100 and chroma_mean > 0.5:
        return 'Romantic'
    # Nostalgic Mood
    elif tempo < 100 and chroma_mean < 0.5 and spectral_contrast_mean < 25:
        return 'Nostalgic'
    # Tense Mood
    elif 100 <= tempo <= 130 and chroma_mean < 0.5 and spectral_contrast_mean > 20:
        return 'Tense'
    # Dreamy Mood
    elif tempo < 80 and chroma_mean > 0.4:
        return 'Dreamy'
    # Neutral Mood (Catch-all)
    else:
        return 'Neutral'

def describe_tempo(tempo):
    """Return a short English qualifier (with article) for a BPM value."""
    # Threshold table, ordered ascending; the first bucket the tempo falls
    # under wins, anything at or above 150 BPM is "a fast".
    buckets = (
        (60, "a very slow"),
        (90, "a slow"),
        (120, "a moderate"),
        (150, "a lively"),
    )
    for upper_bound, label in buckets:
        if tempo < upper_bound:
            return label
    return "a fast"

def generate_video_prompt(previous_mood, current_mood, next_mood, tempo, chroma_mean, spectral_contrast_mean, zero_crossing_rate_mean, mfcc_mean, lyrics_summary):
    """Build the text-to-video prompt for one audio chunk.

    Combines the chunk's mood (plus the neighbouring chunks' moods as
    transition hints), coarse audio-feature descriptors, and a lyrics
    summary into a single prompt string.
    """
    # Translate raw feature values into short visual descriptors.
    if zero_crossing_rate_mean > 0.05:
        rhythm_description = "energetic rhythm"
    else:
        rhythm_description = "smooth rhythm"
    tonal_quality = "mellow tones" if chroma_mean <= 0.5 else "bright tones"
    spectral_description = "soft contrasts" if spectral_contrast_mean <= 20 else "sharp contrasts"
    tempo_description = describe_tempo(tempo)

    # Mention neighbouring moods so consecutive clips blend together.
    transition_parts = []
    if previous_mood:
        transition_parts.append(f"Transition from a {previous_mood.lower()} mood. ")
    if next_mood:
        transition_parts.append(f"Prepare to transition to a {next_mood.lower()} mood. ")
    transition_description = "".join(transition_parts)

    return (
        f"Essence of a {current_mood.lower()} mood. "
        + transition_description
        + f"Showcase a scene with {rhythm_description}, {tonal_quality}, and {spectral_description}. "
        + f"Visualize {tempo_description} tempo. "  # Updated line
        + f"Narrative based on the lyrics: '{lyrics_summary}'. "
        + "Emphasize the themes and emotions conveyed in the song."
    )

# Define and launch the Gradio interface. analyze_audio is a generator, so
# Gradio streams one (description, video) pair per 10-second chunk.
# NOTE(review): the legacy `gradio.outputs.Video` class (imported at the top
# of this file) was removed in Gradio 3.x; the modern `gr.Video` component
# is used here instead — the stale import above should be removed.
gr.Interface(
    fn=analyze_audio,
    inputs=gr.Audio(type="filepath"),
    outputs=[gr.HTML(), gr.Video()],
).launch()