# NOTE(review): this span previously contained Hugging Face Spaces page chrome
# captured by a scraper (status lines, file size, commit hashes, and a gutter of
# line numbers). It was not valid Python; converted to this comment so the file
# can be imported.
import gradio as gr
import librosa
import numpy as np
import requests
from gradio.outputs import Video
from video_generator import generate_video
def extract_lyrics(api_response, chunk_duration=10):
    """Group Deepgram word timings into fixed-length lyric chunks.

    Args:
        api_response: parsed Deepgram ``/v1/listen`` JSON. Words are read from
            ``results.channels[0].alternatives[0].words``; each entry has
            "word", "start" and "end" keys (times in seconds).
        chunk_duration: chunk length in seconds (previously hard-coded to 10).

    Returns:
        List of ``(chunk_start_time, last_word_end_time, text)`` tuples, one per
        chunk that contains at least one word, in chronological order.
        Returns ``[]`` when there are no words (the original crashed on
        ``words_timing[-1]`` in that case).

    Bug fixed: the original advanced the running chunk start by a single
    chunk_duration per flushed chunk, so any silence longer than one chunk made
    every later chunk's start time drift; it also stamped a chunk's end with
    the *next* chunk's first word's end time. Words are now bucketed directly
    by ``start // chunk_duration``, which is immune to gaps.
    """
    words_timing = api_response["results"]["channels"][0]["alternatives"][0]["words"]
    # chunk index -> {"words": [word, ...], "end": end time of last word seen}
    chunks = {}
    for word_info in words_timing:
        idx = int(word_info["start"] // chunk_duration)
        entry = chunks.setdefault(idx, {"words": [], "end": word_info["end"]})
        entry["words"].append(word_info["word"])
        entry["end"] = word_info["end"]  # last word in the bucket wins
    return [
        (idx * chunk_duration, entry["end"], " ".join(entry["words"]))
        for idx, entry in sorted(chunks.items())
    ]
def send_to_deepgram(audio_file_path):
    """POST a raw audio file to Deepgram's transcription endpoint.

    Args:
        audio_file_path: path to the audio file to transcribe.

    Returns:
        The parsed JSON response body from Deepgram.

    Raises:
        requests.HTTPError: if Deepgram returns a non-2xx status.
        OSError: if the audio file cannot be read.
    """
    import os

    endpoint = "https://api.deepgram.com/v1/listen"
    # SECURITY: the API key was previously hard-coded in source. Prefer the
    # DEEPGRAM_API_KEY environment variable; the old literal remains only as a
    # backward-compatible fallback and should be revoked/rotated.
    api_key = os.environ.get(
        "DEEPGRAM_API_KEY", "2114fe20a6bdccf930f9a7fd1931958f063745d7"
    )
    headers = {
        "Authorization": f"Token {api_key}"
    }
    with open(audio_file_path, 'rb') as audio_file:
        audio_data = audio_file.read()
    # Timeout prevents the UI from hanging forever on a stalled connection.
    response = requests.post(endpoint, headers=headers, data=audio_data, timeout=300)
    response.raise_for_status()  # fail loudly instead of parsing an error body
    response_json = response.json()
    print("Deepgram API Response:", response_json)  # Log the response
    return response_json
def analyze_audio(audio_file_path):
    """Generator: analyze an audio file in 10-second chunks.

    For each chunk, yields a ``(description_html, video)`` pair where the
    description carries the detected mood and the text-to-video prompt.

    Fixes over the original:
      * ``analyze_chunk`` was executed twice per chunk (once for the mood
        pre-pass, once in the main loop); features are now computed once and
        reused.
      * ``extract_lyrics`` was re-run on the same Deepgram response inside the
        loop on every iteration; the result computed before the loop is reused.
    """
    print("Analyzing audio...")  # Log start of analysis
    last_frame = None  # NOTE(review): never updated — extract_last_frame call is commented out below
    y, sr = librosa.load(audio_file_path)
    chunk_length = 10 * sr  # 10 seconds of samples
    deepgram_response = send_to_deepgram(audio_file_path)
    lyrics_chunks = extract_lyrics(deepgram_response)
    # Compute every chunk's features exactly once; index i lines up with starts[i].
    starts = list(range(0, len(y), chunk_length))
    features = [analyze_chunk(y[start:start + chunk_length]) for start in starts]
    moods = [f[0] for f in features]
    for i, start in enumerate(starts):
        print(f"Analyzing chunk {i + 1}...")  # Log chunk analysis
        lyrics_summary = lyrics_chunks[i] if i < len(lyrics_chunks) else 'Instrumental or silence'
        previous_mood = moods[i - 1] if i > 0 else None
        current_mood = moods[i]
        next_mood = moods[i + 1] if i < len(moods) - 1 else None
        _, tempo, chroma_mean, spectral_contrast_mean, zero_crossing_rate_mean, mfcc_mean = features[i]
        prompt = generate_video_prompt(previous_mood, current_mood, next_mood, tempo, chroma_mean, spectral_contrast_mean, zero_crossing_rate_mean, mfcc_mean, lyrics_summary)
        description = f"Chunk starting at {start / sr} seconds:<br>Mood: {current_mood}<br>Video Prompt: {prompt}<br><br>"
        print(f"Generating video for chunk {i + 1}...")
        video = generate_video(lyrics_chunks, last_frame)
        #last_frame = extract_last_frame(video)
        print(f"Description for chunk {i + 1}: {description}")
        print(f"Video for chunk {i + 1}: {video}")
        # Yield the result for this chunk so Gradio can stream partial output
        yield (description, video)
def analyze_chunk(chunk):
    """Summarize one waveform chunk and classify its mood.

    Args:
        chunk: 1-D audio sample array (as produced by ``librosa.load``).

    Returns:
        Tuple ``(mood, tempo, chroma_mean, spectral_contrast_mean,
        zero_crossing_rate_mean, mfcc_mean)`` where mood is the label from
        ``analyze_mood`` and the rest are scalar feature means.
    """
    tempo, _ = librosa.beat.beat_track(y=chunk)
    # Collapse each feature matrix to a single scalar via its global mean.
    feature_means = (
        np.mean(librosa.feature.chroma_stft(y=chunk)),
        np.mean(librosa.feature.spectral_contrast(y=chunk)),
        np.mean(librosa.feature.zero_crossing_rate(chunk)),
        np.mean(librosa.feature.mfcc(y=chunk)),
    )
    mood = analyze_mood(tempo, *feature_means)
    return (mood, tempo) + feature_means
def analyze_mood(tempo, chroma_mean, spectral_contrast_mean, zero_crossing_rate_mean, mfcc_mean):
    """Map summary audio features to a coarse mood label.

    Rules are evaluated top to bottom and the first match wins; 'Neutral' is
    the catch-all. NOTE(review): 'Dreamy' and 'Aggressive' are shadowed by
    earlier rules and can never fire — kept as-is to preserve behavior.
    """
    mood_rules = (
        ('Happy', tempo > 110 and chroma_mean > 0.4),
        ('Sad', tempo < 90 and chroma_mean < 0.5 and mfcc_mean < 0),
        ('Energetic', tempo > 130 and zero_crossing_rate_mean > 0.05),
        ('Relaxed', tempo < 100 and chroma_mean > 0.3 and spectral_contrast_mean > 15),
        ('Romantic', tempo < 100 and chroma_mean > 0.5),
        ('Nostalgic', tempo < 100 and chroma_mean < 0.5 and spectral_contrast_mean < 25),
        ('Tense', 100 <= tempo <= 130 and chroma_mean < 0.5 and spectral_contrast_mean > 20),
        ('Dreamy', tempo < 80 and chroma_mean > 0.4),
        ('Aggressive', tempo > 140 and zero_crossing_rate_mean > 0.08),
    )
    for label, matched in mood_rules:
        if matched:
            return label
    return 'Neutral'
def describe_tempo(tempo):
    """Return an article-prefixed qualitative label for a BPM value.

    Bands: <60 very slow, <90 slow, <120 moderate, <150 lively, else fast.
    """
    # Upper bounds are exclusive, mirroring the original if/elif chain.
    bands = (
        (60, "a very slow"),
        (90, "a slow"),
        (120, "a moderate"),
        (150, "a lively"),
    )
    for upper_bound, label in bands:
        if tempo < upper_bound:
            return label
    return "a fast"
def generate_video_prompt(previous_mood, current_mood, next_mood, tempo, chroma_mean, spectral_contrast_mean, zero_crossing_rate_mean, mfcc_mean, lyrics_summary):
    """Compose a text-to-video prompt from mood labels, audio features and lyrics.

    previous_mood / next_mood may be None (first/last chunk), in which case the
    corresponding transition sentence is omitted. mfcc_mean is currently unused
    but kept for signature stability.
    """
    # Translate scalar features into qualitative descriptors.
    rhythm = "energetic rhythm" if zero_crossing_rate_mean > 0.05 else "smooth rhythm"
    tones = "bright tones" if chroma_mean > 0.5 else "mellow tones"
    contrasts = "sharp contrasts" if spectral_contrast_mean > 20 else "soft contrasts"
    transition_parts = []
    if previous_mood:
        transition_parts.append(f"Transition from a {previous_mood.lower()} mood. ")
    if next_mood:
        transition_parts.append(f"Prepare to transition to a {next_mood.lower()} mood. ")
    return (
        f"Essence of a {current_mood.lower()} mood. "
        + "".join(transition_parts)
        + f"Showcase a scene with {rhythm}, {tones}, and {contrasts}. "
        + f"Visualize {describe_tempo(tempo)} tempo. "
        + f"Narrative based on the lyrics: '{lyrics_summary}'. "
        + "Emphasize the themes and emotions conveyed in the song."
    )
# Gradio UI: analyze_audio is a generator, so Gradio streams one
# (HTML description, video) pair per 10-second chunk.
gr.Interface(
    fn=analyze_audio,
    inputs=gr.Audio(type="filepath"),
    # gr.Video() is the supported component class; gradio.outputs.Video was
    # deprecated and removed in Gradio 3.x+ (this file already uses 3.x-style
    # gr.Audio(type=...)). Also removed a stray trailing "|" scrape artifact
    # that broke the file's syntax.
    outputs=[gr.HTML(), gr.Video()],
).launch()