|
|
import gradio as gr |
|
|
import matplotlib.pyplot as plt |
|
|
import librosa |
|
|
import numpy as np |
|
|
from PIL import Image, ImageDraw, ImageFont |
|
|
from moviepy.editor import * |
|
|
from moviepy.video.io.VideoFileClip import VideoFileClip |
|
|
|
|
|
def make_bars_image(height_values, index, new_height):
    """Render a single frame of the audio-bars overlay.

    Draws one white 2px-wide bar per entry of ``height_values`` on a
    transparent 512 x ``new_height`` RGBA canvas, centered horizontally,
    then flips the image vertically so the bars grow upward from the
    bottom, and writes it to ``audio_bars_<index>.png``.

    Parameters:
        height_values: sequence of bar heights in pixels (one per bar).
        index: frame number, used to build the output filename.
        new_height: canvas height in pixels (matches the resized background).

    Returns:
        The path of the PNG file that was written.
    """
    width = 512
    image = Image.new('RGBA', (width, new_height), color=(0, 0, 0, 0))
    draw = ImageDraw.Draw(image)

    # Bar geometry: 2px bars with 2px gaps, centered on the canvas.
    rect_width = 2
    spacing = 2
    num_bars = len(height_values)
    total_width = num_bars * rect_width + (num_bars - 1) * spacing
    start_x = int((width - total_width) / 2)

    # Vertical offset from the (pre-flip) top edge; after the flip this
    # becomes the margin below the bars.
    buffer_size = 80

    x = start_x
    # NOTE: renamed loop variable (was `height`) so it no longer shadows
    # the canvas-height local used above.
    for bar_height in height_values:
        draw.rectangle([x, buffer_size, x + rect_width, bar_height + buffer_size],
                       fill='white')
        # Advancing past the last bar is harmless, so no index check needed.
        x += rect_width + spacing

    # rotate(180) followed by FLIP_LEFT_RIGHT is equivalent to a single
    # vertical flip: 180 deg rotation flips both axes and the horizontal
    # flips cancel out.
    image = image.transpose(Image.FLIP_TOP_BOTTOM)

    out_path = 'audio_bars_' + str(index) + '.png'
    image.save(out_path)
    return out_path
|
|
|
|
|
def db_to_height(db_value, min_db=-80.0, max_height=50.0):
    """Map a decibel value linearly to a bar height in pixels.

    Values are assumed to lie in ``[min_db, 0]`` (librosa's
    ``amplitude_to_db`` with ``ref=np.max`` yields non-positive dB), so
    ``min_db`` maps to 0 and 0 dB maps to ``max_height``. Inputs outside
    that range extrapolate linearly, matching the original behavior.

    Parameters:
        db_value: decibel value to convert.
        min_db: dB value mapped to height 0 (default -80, the original
            hard-coded floor).
        max_height: height in pixels for 0 dB (default 50, the original
            hard-coded scale).

    Returns:
        The bar height in pixels (float).
    """
    # (db - min_db) / (0 - min_db): identical to the original
    # (db_value + 80) / 80 at the default arguments.
    scaled_value = (db_value - min_db) / -min_db
    return scaled_value * max_height
|
|
|
|
|
def infer(title, audio_in, image_in, output_video_path):
    """Build an audio-visualizer video: animated bars over a background image.

    Pipeline: load the audio, compute an STFT spectrogram, sample 114
    frequency bins per STFT frame, convert each frame's dB values to bar
    heights, render one bars PNG per frame, composite each onto the
    resized/captioned background, assemble the frames into a video synced
    to the audio, re-encode it at 25 fps, and save a spectrogram plot.

    Parameters:
        title: caption text drawn onto the background image.
        audio_in: filesystem path of the input audio file.
        image_in: filesystem path of the background image.
        output_video_path: path the final video is written to.

    Returns:
        Tuple of (output_video_path, 'image_out.jpg') — the final video
        path and the saved spectrogram image path.

    Side effects: writes many intermediate files in the working directory
    ('audio_bars_*.png', 'audio_bars_with_bg*.jpg',
    'resized_background.jpg', 'my_video.mp4', 'image_out.jpg').
    """
    audio_path = audio_in
    # librosa.load resamples to its default sr (22050) and returns mono.
    audio_data, sr = librosa.load(audio_path)

    duration = librosa.get_duration(y=audio_data, sr=sr)

    # Full-length window: the slice below is currently a no-op, kept as a
    # hook for trimming to a sub-range later.
    start_time = 0
    end_time = duration

    start_index = int(start_time * sr)
    end_index = int(end_time * sr)

    audio_data = audio_data[start_index:end_index]

    hop_length = 512

    stft = librosa.stft(audio_data, hop_length=hop_length)
    # dB relative to the loudest bin (ref=np.max) -> values are <= 0.
    spectrogram = librosa.amplitude_to_db(np.abs(stft), ref=np.max)

    # NOTE(review): stft.shape[0] is n_fft//2 + 1 (bin count), not n_fft;
    # the resulting freqs axis is therefore compressed. Only its length is
    # actually used below, so the bar heights are unaffected — confirm if
    # the frequency values are ever needed.
    freqs = librosa.fft_frequencies(sr=sr, n_fft=stft.shape[0])

    # Sample 114 evenly-spaced frequency bins -> 114 bars per frame.
    n_freqs = 114
    freq_indices = np.linspace(0, len(freqs) - 1, n_freqs, dtype=int)

    # One (freq, dB) pair list per STFT frame (time column).
    db_values = []
    for i in range(spectrogram.shape[1]):
        db_values.append(list(zip(freqs[freq_indices], spectrogram[freq_indices, i])))

    print(db_values[0])

    proportional_values = []

    # Convert dB to pixel heights; the frequency element is unused here.
    for frame in db_values:
        proportional_frame = [db_to_height(db) for f, db in frame]
        proportional_values.append(proportional_frame)

    print(proportional_values[0])
    print("AUDIO CHUNK: " + str(len(proportional_values)))

    background_image = Image.open(image_in)

    # Resize to a fixed 512px width, preserving aspect ratio.
    bg_width, bg_height = background_image.size
    aspect_ratio = bg_width / bg_height
    new_width = 512
    new_height = int(new_width / aspect_ratio)
    resized_bg = background_image.resize((new_width, new_height))

    # Dark overlay strip pasted at the bottom (used as backdrop for the
    # title text). NOTE(review): 'black_cache.png' must ship alongside the
    # app; using it as `mask` presumes it has an alpha channel — confirm.
    bg_cache = Image.open('black_cache.png')
    resized_bg.paste(bg_cache, (0, resized_bg.height - bg_cache.height), mask=bg_cache)

    draw = ImageDraw.Draw(resized_bg)

    # Caption in white, 30px from the left, 70px above the bottom edge.
    text = title
    font = ImageFont.truetype("Lato-Regular.ttf", 16)
    text_color = (255, 255, 255)

    x = 30
    y = new_height - 70

    draw.text((x, y), text, fill=text_color, font=font)

    # NOTE(review): if image_in has an alpha channel, resized_bg is RGBA
    # and saving as JPEG will raise — consider .convert('RGB') first.
    resized_bg.save('resized_background.jpg')

    # Render one composited frame per STFT column.
    generated_frames = []
    for i, frame in enumerate(proportional_values):
        bars_img = make_bars_image(frame, i, new_height)
        bars_img = Image.open(bars_img)

        # Re-open a fresh background each iteration so bars don't pile up.
        fresh_bg = Image.open('resized_background.jpg')
        fresh_bg.paste(bars_img, (0, 0), mask=bars_img)

        fresh_bg.save('audio_bars_with_bg' + str(i) + '.jpg')
        generated_frames.append('audio_bars_with_bg' + str(i) + '.jpg')
    print(generated_frames)

    # fps chosen so the frame sequence spans exactly the audio duration.
    clip = ImageSequenceClip(generated_frames, fps=len(generated_frames)/(end_time-start_time))
    audio_clip = AudioFileClip(audio_in)
    clip = clip.set_audio(audio_clip)

    codec = 'libx264'
    audio_codec = 'aac'

    # First pass: native (audio-derived) fps.
    clip.write_videofile("my_video.mp4", codec=codec, audio_codec=audio_codec)

    # Second pass: re-encode at a standard 25 fps for playback compatibility.
    retimed_clip = VideoFileClip("my_video.mp4")

    new_fps = 25

    new_clip = retimed_clip.set_fps(new_fps)

    new_clip.write_videofile(output_video_path, codec=codec, audio_codec=audio_codec)

    # Spectrogram figure for the second Gradio output.
    # NOTE(review): librosa.display historically required an explicit
    # `import librosa.display`; confirm the installed version exposes it
    # via the bare `import librosa` at the top of this file.
    plt.figure(figsize=(10, 4))
    librosa.display.specshow(spectrogram, sr=sr, x_axis='time', y_axis='log')
    plt.colorbar(format='%+2.0f dB')
    plt.title('Audio Bars Visualization')

    output_path = 'image_out.jpg'
    plt.savefig(output_path, dpi=300, bbox_inches='tight')

    return output_video_path, 'image_out.jpg'
|
|
|
|
|
# Gradio UI: title textbox, audio upload, background-image upload, and a
# hidden textbox carrying the fixed output path; outputs are the rendered
# video and the spectrogram image.
# NOTE(review): the `source='upload'` kwarg on gr.Audio/gr.Image was
# removed in Gradio 4.x (replaced by `sources=[...]`) — this code pins the
# app to Gradio 3.x; confirm the deployed version.
gr.Interface(fn=infer,
             inputs=[gr.Textbox(placeholder='FIND A GOOD TITLE'),
                     gr.Audio(source='upload', type='filepath'),
                     gr.Image(source='upload', type='filepath'),
                     gr.Textbox(label="Output video path", value="my_final_video.mp4", visible=False)],
             outputs=[gr.Video(label='video result'), gr.Image(label='spectrogram image')],
             title='Animated Audio Visualizer', description='<p style="text-align: center;">Upload an audio file, upload a background image, choose a good title, click submit.</p>').launch()