|
|
import gradio as gr |
|
|
import matplotlib.pyplot as plt |
|
|
import librosa |
|
|
import numpy as np |
|
|
from PIL import Image, ImageDraw, ImageFont |
|
|
from moviepy.editor import * |
|
|
from moviepy.video.io.VideoFileClip import VideoFileClip |
|
|
|
|
|
def make_bars_image(height_values, index, new_height):
    """Render a single frame of the audio-bars overlay.

    Draws one white 2px-wide bar per entry of ``height_values`` on a
    transparent 512 x ``new_height`` RGBA canvas, centered horizontally,
    then flips the image vertically so the bars grow upward from the
    bottom, and writes it to ``audio_bars_<index>.png``.

    Parameters:
        height_values: sequence of bar heights in pixels (one per bar).
        index: frame number, used to build the output filename.
        new_height: canvas height in pixels (matches the resized background).

    Returns:
        The path of the PNG file that was written.
    """
    width = 512
    image = Image.new('RGBA', (width, new_height), color=(0, 0, 0, 0))
    draw = ImageDraw.Draw(image)

    # Bar geometry: 2px bars with 2px gaps, centered on the canvas.
    rect_width = 2
    spacing = 2
    num_bars = len(height_values)
    total_width = num_bars * rect_width + (num_bars - 1) * spacing
    start_x = int((width - total_width) / 2)

    # Vertical offset from the (pre-flip) top edge; after the flip this
    # becomes the margin below the bars.
    buffer_size = 80

    x = start_x
    # NOTE: renamed loop variable (was `height`) so it no longer shadows
    # the canvas-height local used above.
    for bar_height in height_values:
        draw.rectangle([x, buffer_size, x + rect_width, bar_height + buffer_size],
                       fill='white')
        # Advancing past the last bar is harmless, so no index check needed.
        x += rect_width + spacing

    # rotate(180) followed by FLIP_LEFT_RIGHT is equivalent to a single
    # vertical flip: 180 deg rotation flips both axes and the horizontal
    # flips cancel out.
    image = image.transpose(Image.FLIP_TOP_BOTTOM)

    out_path = 'audio_bars_' + str(index) + '.png'
    image.save(out_path)
    return out_path
|
|
|
|
|
def db_to_height(db_value, min_db=-80.0, max_height=50.0):
    """Map a decibel value linearly to a bar height in pixels.

    Values are assumed to lie in ``[min_db, 0]`` (librosa's
    ``amplitude_to_db`` with ``ref=np.max`` yields non-positive dB), so
    ``min_db`` maps to 0 and 0 dB maps to ``max_height``. Inputs outside
    that range extrapolate linearly, matching the original behavior.

    Parameters:
        db_value: decibel value to convert.
        min_db: dB value mapped to height 0 (default -80, the original
            hard-coded floor).
        max_height: height in pixels for 0 dB (default 50, the original
            hard-coded scale).

    Returns:
        The bar height in pixels (float).
    """
    # (db - min_db) / (0 - min_db): identical to the original
    # (db_value + 80) / 80 at the default arguments.
    scaled_value = (db_value - min_db) / -min_db
    return scaled_value * max_height
|
|
|
|
|
def infer(title, audio_in, image_in, output_video_path):
    """Build an audio-visualizer video: animated bars over a background image.

    Pipeline: load the audio, compute an STFT spectrogram, sample 114
    frequency bins per STFT frame, convert each frame's dB values to bar
    heights, render one bars PNG per frame, composite each onto the
    resized/captioned background, assemble the frames into a video synced
    to the audio, re-encode it at 25 fps, and save a spectrogram plot.

    Parameters:
        title: caption text drawn onto the background image.
        audio_in: filesystem path of the input audio file.
        image_in: filesystem path of the background image.
        output_video_path: path the final video is written to.

    Returns:
        Tuple of (output_video_path, 'image_out.jpg') — the final video
        path and the saved spectrogram image path.

    Side effects: writes many intermediate files in the working directory
    ('audio_bars_*.png', 'audio_bars_with_bg*.jpg',
    'resized_background.jpg', 'my_video.mp4', 'image_out.jpg').
    """
    audio_path = audio_in
    # librosa.load resamples to its default sr (22050) and returns mono.
    audio_data, sr = librosa.load(audio_path)

    duration = librosa.get_duration(y=audio_data, sr=sr)

    # Full-length window: the slice below is currently a no-op, kept as a
    # hook for trimming to a sub-range later.
    start_time = 0
    end_time = duration

    start_index = int(start_time * sr)
    end_index = int(end_time * sr)

    audio_data = audio_data[start_index:end_index]

    hop_length = 512

    stft = librosa.stft(audio_data, hop_length=hop_length)
    # dB relative to the loudest bin (ref=np.max) -> values are <= 0.
    spectrogram = librosa.amplitude_to_db(np.abs(stft), ref=np.max)

    # NOTE(review): stft.shape[0] is n_fft//2 + 1 (bin count), not n_fft;
    # the resulting freqs axis is therefore compressed. Only its length is
    # actually used below, so the bar heights are unaffected — confirm if
    # the frequency values are ever needed.
    freqs = librosa.fft_frequencies(sr=sr, n_fft=stft.shape[0])

    # Sample 114 evenly-spaced frequency bins -> 114 bars per frame.
    n_freqs = 114
    freq_indices = np.linspace(0, len(freqs) - 1, n_freqs, dtype=int)

    # One (freq, dB) pair list per STFT frame (time column).
    db_values = []
    for i in range(spectrogram.shape[1]):
        db_values.append(list(zip(freqs[freq_indices], spectrogram[freq_indices, i])))

    print(db_values[0])

    proportional_values = []

    # Convert dB to pixel heights; the frequency element is unused here.
    for frame in db_values:
        proportional_frame = [db_to_height(db) for f, db in frame]
        proportional_values.append(proportional_frame)

    print(proportional_values[0])
    print("AUDIO CHUNK: " + str(len(proportional_values)))

    background_image = Image.open(image_in)

    # Resize to a fixed 512px width, preserving aspect ratio.
    bg_width, bg_height = background_image.size
    aspect_ratio = bg_width / bg_height
    new_width = 512
    new_height = int(new_width / aspect_ratio)
    resized_bg = background_image.resize((new_width, new_height))

    # Dark overlay strip pasted at the bottom (used as backdrop for the
    # title text). NOTE(review): 'black_cache.png' must ship alongside the
    # app; using it as `mask` presumes it has an alpha channel — confirm.
    bg_cache = Image.open('black_cache.png')
    resized_bg.paste(bg_cache, (0, resized_bg.height - bg_cache.height), mask=bg_cache)

    draw = ImageDraw.Draw(resized_bg)

    # Caption in white, 30px from the left, 70px above the bottom edge.
    text = title
    font = ImageFont.truetype("Lato-Regular.ttf", 16)
    text_color = (255, 255, 255)

    x = 30
    y = new_height - 70

    draw.text((x, y), text, fill=text_color, font=font)

    # NOTE(review): if image_in has an alpha channel, resized_bg is RGBA
    # and saving as JPEG will raise — consider .convert('RGB') first.
    resized_bg.save('resized_background.jpg')

    # Render one composited frame per STFT column.
    generated_frames = []
    for i, frame in enumerate(proportional_values):
        bars_img = make_bars_image(frame, i, new_height)
        bars_img = Image.open(bars_img)

        # Re-open a fresh background each iteration so bars don't pile up.
        fresh_bg = Image.open('resized_background.jpg')
        fresh_bg.paste(bars_img, (0, 0), mask=bars_img)

        fresh_bg.save('audio_bars_with_bg' + str(i) + '.jpg')
        generated_frames.append('audio_bars_with_bg' + str(i) + '.jpg')
    print(generated_frames)

    # fps chosen so the frame sequence spans exactly the audio duration.
    clip = ImageSequenceClip(generated_frames, fps=len(generated_frames)/(end_time-start_time))
    audio_clip = AudioFileClip(audio_in)
    clip = clip.set_audio(audio_clip)

    codec = 'libx264'
    audio_codec = 'aac'

    # First pass: native (audio-derived) fps.
    clip.write_videofile("my_video.mp4", codec=codec, audio_codec=audio_codec)

    # Second pass: re-encode at a standard 25 fps for playback compatibility.
    retimed_clip = VideoFileClip("my_video.mp4")

    new_fps = 25

    new_clip = retimed_clip.set_fps(new_fps)

    new_clip.write_videofile(output_video_path, codec=codec, audio_codec=audio_codec)

    # Spectrogram figure for the second Gradio output.
    # NOTE(review): librosa.display historically required an explicit
    # `import librosa.display`; confirm the installed version exposes it
    # via the bare `import librosa` at the top of this file.
    plt.figure(figsize=(10, 4))
    librosa.display.specshow(spectrogram, sr=sr, x_axis='time', y_axis='log')
    plt.colorbar(format='%+2.0f dB')
    plt.title('Audio Bars Visualization')

    output_path = 'image_out.jpg'
    plt.savefig(output_path, dpi=300, bbox_inches='tight')

    return output_video_path, 'image_out.jpg'
|
|
|
|
|
# Gradio UI: title textbox, audio upload, background-image upload, and a
# hidden textbox carrying the fixed output path; outputs are the rendered
# video and the spectrogram image.
# NOTE(review): the `source='upload'` kwarg on gr.Audio/gr.Image was
# removed in Gradio 4.x (replaced by `sources=[...]`) — this code pins the
# app to Gradio 3.x; confirm the deployed version.
gr.Interface(fn=infer,
             inputs=[gr.Textbox(placeholder='FIND A GOOD TITLE'),
                     gr.Audio(source='upload', type='filepath'),
                     gr.Image(source='upload', type='filepath'),
                     gr.Textbox(label="Output video path", value="my_final_video.mp4", visible=False)],
             outputs=[gr.Video(label='video result'), gr.Image(label='spectrogram image')],
             title='Animated Audio Visualizer', description='<p style="text-align: center;">Upload an audio file, upload a background image, choose a good title, click submit.</p>').launch()