Spaces:
Runtime error
Runtime error
Add application file
Browse files- main.py +153 -0
- requirements.txt +6 -0
- video_generator.py +59 -0
main.py
ADDED
|
@@ -0,0 +1,153 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
import librosa
import numpy as np
import requests

from video_generator import generate_video, extract_last_frame

# Gradio < 4.x exposed output component classes in gradio.outputs; that module
# was removed in Gradio 4.x and importing it crashes the app at startup.
# Fall back to the top-level component class on newer Gradio releases.
try:
    from gradio.outputs import Video
except ImportError:
    from gradio import Video
|
| 8 |
+
|
| 9 |
+
def extract_lyrics(api_response):
    """Split a Deepgram transcription into 10-second lyric chunks.

    Returns a list of strings, one per 10-second window of the audio, so
    that element i lines up with the i-th audio chunk in analyze_audio().
    Windows with no words produce an empty string. Returns [] when the
    response carries no usable transcription.
    """
    try:
        words_timing = api_response["results"]["channels"][0]["alternatives"][0]["words"]
        lyrics_chunks = []
        CHUNK_DURATION = 10  # seconds; must match the chunking in analyze_audio()
        current_chunk = ""
        current_chunk_start_time = 0
        for word_info in words_timing:
            word = word_info["word"]
            start_time = word_info["start"]
            # Use a while-loop so a silence longer than one window emits
            # empty chunks instead of drifting the chunk/lyrics alignment
            # (an `if` here would advance at most one window per word).
            while start_time >= current_chunk_start_time + CHUNK_DURATION:
                lyrics_chunks.append(current_chunk.strip())
                current_chunk = ""
                current_chunk_start_time += CHUNK_DURATION
            current_chunk += " " + word
        lyrics_chunks.append(current_chunk.strip())
        return lyrics_chunks
    except (KeyError, IndexError):
        # IndexError covers empty channels/alternatives lists, which the
        # original KeyError-only handler let escape.
        print("Error in API response:", api_response)
        return []
|
| 29 |
+
|
| 30 |
+
def send_to_deepgram(audio_file_path):
    """POST the raw audio file to Deepgram's transcription endpoint.

    Returns the decoded JSON response (extract_lyrics() consumes the
    word-timing portion of it).
    """
    import os

    endpoint = "https://api.deepgram.com/v1/listen"
    # SECURITY: an API token used to be hard-coded here and must be treated
    # as leaked — rotate it and supply the replacement via the environment.
    headers = {
        "Authorization": "Token " + os.environ.get("DEEPGRAM_API_KEY", "")
    }
    with open(audio_file_path, 'rb') as audio_file:
        audio_data = audio_file.read()

    # A timeout keeps the Gradio worker from hanging forever on network issues.
    response = requests.post(endpoint, headers=headers, data=audio_data, timeout=60)
    payload = response.json()  # parse once instead of twice
    print(payload)  # keep the debug log of the raw response
    return payload
|
| 42 |
+
|
| 43 |
+
def analyze_audio(audio_file_path):
    """Generator: analyze a song in 10-second chunks, yielding (html, video).

    For each chunk: extract audio features, classify a mood, build a
    text-to-video prompt (aware of the neighbouring chunks' moods and the
    transcribed lyrics), and generate a clip that continues from the
    previous clip's last frame. Yields one (description, video) pair per
    chunk so Gradio can stream results.
    """
    y, sr = librosa.load(audio_file_path)
    chunk_length = 10 * sr  # samples per 10-second chunk; matches extract_lyrics()

    deepgram_response = send_to_deepgram(audio_file_path)
    lyrics_chunks = extract_lyrics(deepgram_response)

    # Analyze every chunk exactly once up front (the original ran
    # analyze_chunk twice per chunk: once for moods, again for features).
    starts = list(range(0, len(y), chunk_length))
    analyses = [analyze_chunk(y[start:start + chunk_length]) for start in starts]
    moods = [analysis[0] for analysis in analyses]

    last_frame = None
    for i, start in enumerate(starts):
        current_mood, tempo, chroma_mean, spectral_contrast_mean, zero_crossing_rate_mean, mfcc_mean = analyses[i]
        lyrics_summary = lyrics_chunks[i] if i < len(lyrics_chunks) else 'Instrumental or silence'
        previous_mood = moods[i - 1] if i > 0 else None
        next_mood = moods[i + 1] if i < len(moods) - 1 else None
        prompt = generate_video_prompt(previous_mood, current_mood, next_mood, tempo, chroma_mean, spectral_contrast_mean, zero_crossing_rate_mean, mfcc_mean, lyrics_summary)
        description = f"Chunk starting at {start / sr} seconds:<br>Mood: {current_mood}<br>Video Prompt: {prompt}<br><br>"
        video = generate_video(prompt, last_frame)
        # Keep the final frame so the next clip can continue from it.
        last_frame = extract_last_frame(video)

        # Yield the result for this chunk
        yield (description, video)
|
| 71 |
+
|
| 72 |
+
def analyze_chunk(chunk):
    """Extract aggregate audio features for one chunk and classify its mood.

    Returns a 6-tuple: (mood, tempo, chroma_mean, spectral_contrast_mean,
    zero_crossing_rate_mean, mfcc_mean).
    """
    tempo, _ = librosa.beat.beat_track(y=chunk)
    features = (
        tempo,
        np.mean(librosa.feature.chroma_stft(y=chunk)),
        np.mean(librosa.feature.spectral_contrast(y=chunk)),
        np.mean(librosa.feature.zero_crossing_rate(chunk)),
        np.mean(librosa.feature.mfcc(y=chunk)),
    )
    return (analyze_mood(*features),) + features
|
| 80 |
+
|
| 81 |
+
def analyze_mood(tempo, chroma_mean, spectral_contrast_mean, zero_crossing_rate_mean, mfcc_mean):
    """Classify a chunk's mood from its aggregate audio features.

    The rules are checked in order and the first match wins; 'Neutral'
    is the catch-all.
    """
    # Happy Mood
    if tempo > 110 and chroma_mean > 0.4:
        return 'Happy'
    # Sad Mood
    elif tempo < 90 and chroma_mean < 0.5 and mfcc_mean < 0:
        return 'Sad'
    # Aggressive Mood — must be tested BEFORE Energetic: its conditions
    # (tempo > 140, zcr > 0.08) are strictly stronger than Energetic's
    # (tempo > 130, zcr > 0.05), so with the original ordering this
    # branch was unreachable.
    elif tempo > 140 and zero_crossing_rate_mean > 0.08:
        return 'Aggressive'
    # Energetic Mood
    elif tempo > 130 and zero_crossing_rate_mean > 0.05:
        return 'Energetic'
    # Relaxed Mood
    elif tempo < 100 and chroma_mean > 0.3 and spectral_contrast_mean > 15:
        return 'Relaxed'
    # Romantic Mood
    elif tempo < 100 and chroma_mean > 0.5:
        return 'Romantic'
    # Nostalgic Mood
    elif tempo < 100 and chroma_mean < 0.5 and spectral_contrast_mean < 25:
        return 'Nostalgic'
    # Tense Mood
    elif 100 <= tempo <= 130 and chroma_mean < 0.5 and spectral_contrast_mean > 20:
        return 'Tense'
    # Dreamy Mood
    # NOTE(review): largely shadowed by Sad/Relaxed/Romantic/Nostalgic above;
    # only narrow boundary inputs (e.g. chroma_mean == 0.5 with low spectral
    # contrast) reach it — confirm intended thresholds.
    elif tempo < 80 and chroma_mean > 0.4:
        return 'Dreamy'
    # Neutral Mood (Catch-all)
    else:
        return 'Neutral'
|
| 112 |
+
|
| 113 |
+
def describe_tempo(tempo):
    """Map a BPM value onto a qualitative pace phrase for the video prompt."""
    # Upper-exclusive bounds, checked in ascending order.
    for upper_bound, label in (
        (60, "a very slow"),
        (90, "a slow"),
        (120, "a moderate"),
        (150, "a lively"),
    ):
        if tempo < upper_bound:
            return label
    return "a fast"
|
| 124 |
+
|
| 125 |
+
def generate_video_prompt(previous_mood, current_mood, next_mood, tempo, chroma_mean, spectral_contrast_mean, zero_crossing_rate_mean, mfcc_mean, lyrics_summary):
    """Build the text-to-video prompt for one audio chunk.

    Combines the chunk's mood (plus the neighbouring chunks' moods for
    transition hints), qualitative descriptions of the audio features, and
    the lyrics excerpt. Note: mfcc_mean is accepted for interface symmetry
    with analyze_chunk() but is not used in the prompt text.
    """
    rhythm = "energetic rhythm" if zero_crossing_rate_mean > 0.05 else "smooth rhythm"
    tones = "bright tones" if chroma_mean > 0.5 else "mellow tones"
    contrasts = "sharp contrasts" if spectral_contrast_mean > 20 else "soft contrasts"
    pace = describe_tempo(tempo)

    parts = [f"Essence of a {current_mood.lower()} mood. "]
    if previous_mood:
        parts.append(f"Transition from a {previous_mood.lower()} mood. ")
    if next_mood:
        parts.append(f"Prepare to transition to a {next_mood.lower()} mood. ")
    parts.append(f"Showcase a scene with {rhythm}, {tones}, and {contrasts}. ")
    parts.append(f"Visualize {pace} tempo. ")
    parts.append(f"Narrative based on the lyrics: '{lyrics_summary}'. ")
    parts.append("Emphasize the themes and emotions conveyed in the song.")
    return "".join(parts)
|
| 147 |
+
|
| 148 |
+
# Define the Gradio interface. analyze_audio is a generator, so one
# (description, video) pair is streamed per 10-second chunk.
gr.Interface(
    fn=analyze_audio,
    inputs=gr.Audio(type="filepath"),
    # gr.Video is the supported component class across Gradio versions;
    # gradio.outputs.Video was removed in Gradio 4.x and crashes at import.
    outputs=[gr.HTML(), gr.Video()],
).launch(share=True)
|
requirements.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio
|
| 2 |
+
librosa
|
| 3 |
+
numpy
|
| 4 |
+
requests
|
| 5 |
+
stability-sdk
|
| 6 |
+
opencv-python-headless
|
video_generator.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from stability_sdk import api
|
| 2 |
+
from stability_sdk.animation import AnimationArgs, Animator
|
| 3 |
+
|
| 4 |
+
import os

STABILITY_HOST = "grpc.stability.ai:443"
# SECURITY: an API key was previously hard-coded here and is published in
# the repo history — it must be rotated. Supply the replacement through the
# STABILITY_KEY environment variable.
# Keys are issued at https://platform.stability.ai/account/keys
STABILITY_KEY = os.environ.get("STABILITY_KEY", "")
context = api.Context(STABILITY_HOST, STABILITY_KEY)
|
| 7 |
+
|
| 8 |
+
def generate_video(prompt, last_frame_image, max_frames=48):
    """Render a short clip for *prompt* via the Stability animation API.

    Saves each rendered frame as frame_NNNNN.png in the working directory,
    then hands the frame paths to combine_frames_into_video().

    NOTE(review): last_frame_image is passed in by the caller for visual
    continuity between clips but is not wired into the animation arguments
    yet — confirm the intended behaviour.
    """
    # Animation settings: locked seed and interpolated prompts for a
    # temporally stable clip.
    args = AnimationArgs()
    args.interpolate_prompts = True
    args.locked_seed = True
    args.max_frames = max_frames
    args.seed = 42
    args.strength_curve = "0:(0)"
    args.diffusion_cadence_curve = "0:(4)"
    args.cadence_interp = "film"

    # The same prompt at the start and the midpoint keeps the whole clip
    # on a single theme.
    animation_prompts = {
        0: prompt,
        max_frames // 2: prompt,
    }

    # Orchestrates the rendering against the shared API context.
    animator = Animator(
        api_context=context,
        animation_prompts=animation_prompts,
        negative_prompt="",
        args=args,
    )

    # Render every frame, save it to disk, and remember its path.
    frame_paths = []
    for index, frame in enumerate(animator.render()):
        path = f"frame_{index:05d}.png"
        frame.save(path)
        frame_paths.append(path)

    # Combine frames into a video (replace with your preferred method)
    return combine_frames_into_video(frame_paths)
|
| 44 |
+
|
| 45 |
+
def extract_last_frame(video):
    """Return the final frame of *video*.

    Thin wrapper around the (currently placeholder) helper so callers have
    a stable entry point once a real implementation lands.
    """
    return extract_last_frame_from_video(video)
|
| 50 |
+
|
| 51 |
+
def combine_frames_into_video(frames):
    """Stitch the saved frame image paths into a video.

    Placeholder: always returns None until a real implementation (e.g.
    OpenCV's VideoWriter over *frames*) is plugged in.
    """
    # TODO: implement, e.g. with cv2.VideoWriter.
    return None
|
| 55 |
+
|
| 56 |
+
def extract_last_frame_from_video(video):
    """Grab the last frame of *video*.

    Placeholder: always returns None until a real implementation (e.g.
    reading the final frame with OpenCV) is plugged in.
    """
    # TODO: implement, e.g. with cv2.VideoCapture seeking to the last frame.
    return None
|