antonelli committed on
Commit
80b6d17
·
1 Parent(s): dd71667

Add application file

Browse files
Files changed (3) hide show
  1. main.py +153 -0
  2. requirements.txt +6 -0
  3. video_generator.py +59 -0
main.py ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import librosa
3
+ import numpy as np
4
+ import requests
5
+ from gradio.outputs import Video
6
+
7
+ from video_generator import generate_video, extract_last_frame
8
+
9
def extract_lyrics(api_response):
    """Split Deepgram word timings into 10-second lyric chunks.

    Args:
        api_response: Parsed JSON returned by the Deepgram /listen endpoint;
            word timings are read from
            results.channels[0].alternatives[0].words.

    Returns:
        A list of strings, one per 10-second window of the audio (empty
        string for silent windows), so indices line up with the 10-second
        audio chunks produced by the caller. Returns [] and logs the
        response if the expected keys are missing.
    """
    try:
        words_timing = api_response["results"]["channels"][0]["alternatives"][0]["words"]
    except KeyError:
        print("Error in API response:", api_response)
        return []

    CHUNK_DURATION = 10  # seconds; must match the audio chunking in analyze_audio
    lyrics_chunks = []
    current_chunk = ""
    current_chunk_start_time = 0
    for word_info in words_timing:
        word = word_info["word"]
        start_time = word_info["start"]
        # `while`, not `if`: a silent gap can span several chunk windows,
        # and the original single-step advance let lyric indices drift out
        # of sync with the audio chunks. Emit an empty chunk per skipped
        # window to keep alignment.
        while start_time >= current_chunk_start_time + CHUNK_DURATION:
            lyrics_chunks.append(current_chunk.strip())
            current_chunk = ""
            current_chunk_start_time += CHUNK_DURATION
        current_chunk += " " + word
    lyrics_chunks.append(current_chunk.strip())
    return lyrics_chunks
29
+
30
def send_to_deepgram(audio_file_path):
    """POST an audio file to Deepgram's transcription API.

    Args:
        audio_file_path: Path to the audio file to transcribe.

    Returns:
        The parsed JSON response from Deepgram (also printed for debugging).
    """
    import os

    endpoint = "https://api.deepgram.com/v1/listen"
    # SECURITY: the API key used to be hard-coded here (and committed to the
    # repo). Read it from the environment instead; the leaked key must be
    # revoked and rotated.
    api_key = os.environ.get("DEEPGRAM_API_KEY", "")
    headers = {
        "Authorization": f"Token {api_key}"
    }
    with open(audio_file_path, 'rb') as audio_file:
        audio_data = audio_file.read()

    response = requests.post(endpoint, headers=headers, data=audio_data)
    payload = response.json()  # parse once instead of twice
    print(payload)
    return payload
42
+
43
def analyze_audio(audio_file_path):
    """Analyze a song in 10-second chunks and stream (description, video) pairs.

    For each chunk: classify its mood from librosa features, pick the
    matching lyric chunk from the Deepgram transcription, build a
    text-to-video prompt, render a clip, and chain the clip's last frame
    into the next render.

    Args:
        audio_file_path: Path to the uploaded audio file.

    Yields:
        (description_html, video) for every 10-second chunk, so the Gradio
        UI can update incrementally.
    """
    last_frame = None
    y, sr = librosa.load(audio_file_path)
    chunk_length = 10 * sr  # samples per 10-second chunk

    deepgram_response = send_to_deepgram(audio_file_path)
    lyrics_chunks = extract_lyrics(deepgram_response)

    # Extract features for every chunk exactly once. The original code
    # called analyze_chunk() twice per chunk (once in a mood pre-pass, once
    # in the main loop), doubling the most expensive work.
    chunk_starts = range(0, len(y), chunk_length)
    analyses = [analyze_chunk(y[start:start + chunk_length]) for start in chunk_starts]
    moods = [analysis[0] for analysis in analyses]

    for i, start in enumerate(chunk_starts):
        current_mood, tempo, chroma_mean, spectral_contrast_mean, zero_crossing_rate_mean, mfcc_mean = analyses[i]
        lyrics_summary = lyrics_chunks[i] if i < len(lyrics_chunks) else 'Instrumental or silence'
        previous_mood = moods[i - 1] if i > 0 else None
        next_mood = moods[i + 1] if i < len(moods) - 1 else None
        prompt = generate_video_prompt(previous_mood, current_mood, next_mood, tempo, chroma_mean, spectral_contrast_mean, zero_crossing_rate_mean, mfcc_mean, lyrics_summary)
        description = f"Chunk starting at {start / sr} seconds:<br>Mood: {current_mood}<br>Video Prompt: {prompt}<br><br>"
        video = generate_video(prompt, last_frame)
        last_frame = extract_last_frame(video)

        # Yield the result for this chunk
        yield (description, video)
71
+
72
def analyze_chunk(chunk):
    """Compute averaged audio features for one chunk and classify its mood.

    Args:
        chunk: 1-D audio sample array for a (up to) 10-second window.

    Returns:
        A 6-tuple (mood, tempo, chroma_mean, spectral_contrast_mean,
        zero_crossing_rate_mean, mfcc_mean).
    """
    tempo, _beat_frames = librosa.beat.beat_track(y=chunk)
    # Each feature is collapsed to a single scalar by averaging over
    # all frames and bins.
    feature_means = (
        np.mean(librosa.feature.chroma_stft(y=chunk)),
        np.mean(librosa.feature.spectral_contrast(y=chunk)),
        np.mean(librosa.feature.zero_crossing_rate(chunk)),
        np.mean(librosa.feature.mfcc(y=chunk)),
    )
    mood = analyze_mood(tempo, *feature_means)
    return (mood, tempo) + feature_means
80
+
81
def analyze_mood(tempo, chroma_mean, spectral_contrast_mean, zero_crossing_rate_mean, mfcc_mean):
    """Map averaged audio features to a coarse mood label.

    The rules are evaluated strictly in order and the first match wins,
    so earlier moods shadow later ones; 'Neutral' is the catch-all.

    Returns:
        One of 'Happy', 'Sad', 'Energetic', 'Relaxed', 'Romantic',
        'Nostalgic', 'Tense', 'Dreamy', 'Aggressive', or 'Neutral'.
    """
    rules = (
        ('Happy', tempo > 110 and chroma_mean > 0.4),
        ('Sad', tempo < 90 and chroma_mean < 0.5 and mfcc_mean < 0),
        ('Energetic', tempo > 130 and zero_crossing_rate_mean > 0.05),
        ('Relaxed', tempo < 100 and chroma_mean > 0.3 and spectral_contrast_mean > 15),
        ('Romantic', tempo < 100 and chroma_mean > 0.5),
        ('Nostalgic', tempo < 100 and chroma_mean < 0.5 and spectral_contrast_mean < 25),
        ('Tense', 100 <= tempo <= 130 and chroma_mean < 0.5 and spectral_contrast_mean > 20),
        ('Dreamy', tempo < 80 and chroma_mean > 0.4),
        ('Aggressive', tempo > 140 and zero_crossing_rate_mean > 0.08),
    )
    for label, matched in rules:
        if matched:
            return label
    return 'Neutral'
112
+
113
def describe_tempo(tempo):
    """Return an adjective phrase (with article) describing a BPM value."""
    # Ordered (upper_bound, phrase) table; first bound the tempo is below wins.
    bands = (
        (60, "a very slow"),
        (90, "a slow"),
        (120, "a moderate"),
        (150, "a lively"),
    )
    for upper_bound, phrase in bands:
        if tempo < upper_bound:
            return phrase
    return "a fast"
124
+
125
def generate_video_prompt(previous_mood, current_mood, next_mood, tempo, chroma_mean, spectral_contrast_mean, zero_crossing_rate_mean, mfcc_mean, lyrics_summary):
    """Compose the text-to-video prompt for one audio chunk.

    Args:
        previous_mood / next_mood: Neighboring chunk moods, or None at the
            edges; when present they add transition sentences.
        current_mood: Mood label for this chunk.
        tempo, chroma_mean, spectral_contrast_mean, zero_crossing_rate_mean:
            Averaged audio features driving the scene wording.
        mfcc_mean: Accepted for signature parity; not used in the prompt.
        lyrics_summary: Lyric text for this chunk, embedded verbatim.

    Returns:
        The assembled prompt string.
    """
    rhythm = "energetic rhythm" if zero_crossing_rate_mean > 0.05 else "smooth rhythm"
    tones = "bright tones" if chroma_mean > 0.5 else "mellow tones"
    contrasts = "sharp contrasts" if spectral_contrast_mean > 20 else "soft contrasts"
    pace = describe_tempo(tempo)

    pieces = [f"Essence of a {current_mood.lower()} mood. "]
    if previous_mood:
        pieces.append(f"Transition from a {previous_mood.lower()} mood. ")
    if next_mood:
        pieces.append(f"Prepare to transition to a {next_mood.lower()} mood. ")
    pieces.append(f"Showcase a scene with {rhythm}, {tones}, and {contrasts}. ")
    pieces.append(f"Visualize {pace} tempo. ")
    pieces.append(f"Narrative based on the lyrics: '{lyrics_summary}'. ")
    pieces.append("Emphasize the themes and emotions conveyed in the song.")
    return "".join(pieces)
147
+
148
# Gradio UI: one audio file in, streamed (HTML description, video) pairs out.
# gr.Video replaces the Gradio 2.x `gradio.outputs.Video` class, which was
# removed in Gradio 3+ and breaks the app at import time on current versions.
gr.Interface(
    fn=analyze_audio,
    inputs=gr.Audio(type="filepath"),
    outputs=[gr.HTML(), gr.Video()],
).launch(share=True)
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ gradio
2
+ librosa
3
+ numpy
4
+ requests
5
+ stability-sdk
6
+ opencv-python-headless
video_generator.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from stability_sdk import api
2
+ from stability_sdk.animation import AnimationArgs, Animator
3
+
4
import os

STABILITY_HOST = "grpc.stability.ai:443"
# SECURITY: the key from https://platform.stability.ai/account/keys used to
# be hard-coded here (and committed to the repo). Read it from the
# environment instead; the leaked key must be revoked and rotated.
STABILITY_KEY = os.environ.get("STABILITY_KEY", "")
# Shared gRPC context used by generate_video() below.
context = api.Context(STABILITY_HOST, STABILITY_KEY)
7
+
8
def generate_video(prompt, last_frame_image, max_frames=48):
    """Render a short animation for *prompt* via the Stability animation API.

    Args:
        prompt: Text prompt driving the animation.
        last_frame_image: Final frame of the previous clip. NOTE(review):
            currently unused by the rendering below — confirm whether it
            should seed the animation.
        max_frames: Number of frames to render.

    Returns:
        The video produced by combine_frames_into_video().
    """
    # Animation settings: a locked seed and flat strength/cadence curves
    # keep rendering deterministic across chunks.
    args = AnimationArgs()
    args.interpolate_prompts = True
    args.locked_seed = True
    args.max_frames = max_frames
    args.seed = 42
    args.strength_curve = "0:(0)"
    args.diffusion_cadence_curve = "0:(4)"
    args.cadence_interp = "film"

    # The same prompt is keyed at frame 0 and the midpoint frame.
    animator = Animator(
        api_context=context,
        animation_prompts={0: prompt, max_frames // 2: prompt},
        negative_prompt="",
        args=args,
    )

    # Save every rendered frame to disk and collect the file paths.
    frame_paths = []
    for idx, frame in enumerate(animator.render()):
        frame_path = f"frame_{idx:05d}.png"
        frame.save(frame_path)
        frame_paths.append(frame_path)

    # Combine frames into a video (replace with your preferred method)
    return combine_frames_into_video(frame_paths)
+
45
def extract_last_frame(video):
    """Return the final frame image of *video*.

    Thin wrapper that delegates to extract_last_frame_from_video().
    """
    return extract_last_frame_from_video(video)
50
+
51
def combine_frames_into_video(frames):
    """Combine frame image paths into a single video.

    TODO: not implemented yet (e.g. use OpenCV's VideoWriter); currently
    always returns None regardless of input.
    """
    video = None  # placeholder until a real encoder is wired in
    return video
55
+
56
def extract_last_frame_from_video(video):
    """Extract the last frame image from *video*.

    TODO: not implemented yet (e.g. use OpenCV's VideoCapture); currently
    always returns None regardless of input.
    """
    last_frame_image = None  # placeholder until frame extraction is implemented
    return last_frame_image