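"""VoiceBlend demo app.

A Gradio interface that lets a user pick a song, pick a target voice model,
run singing-voice conversion on the vocal stem, remix the converted vocals
with the instrumentals, and generate an Instagram-style caption and cover
image with the OpenAI API.

Assumptions (not enforced here): the `svc` command-line tool used for
inference is installed and on PATH, and OPENAI_API_KEY is set in the
environment or in a .env file.
"""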
from pathlib import Path
from io import BytesIO
import os
import shlex
import subprocess

import gradio as gr
import numpy as np
import librosa
import requests
from scipy.io import wavfile
from openai import OpenAI
from dotenv import load_dotenv
from PIL import Image

# Load environment variables (expects OPENAI_API_KEY) and create the OpenAI client.
# Note: the key must be passed to the client (or present in the environment);
# setting it as a class attribute on OpenAI has no effect.
load_dotenv()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
# Paths according to your directory structure
CURRENT_DIR = Path(__file__).resolve().parent
PEOPLE_DIR = CURRENT_DIR / "people"
SONGS_DIR = CURRENT_DIR / "songs"
INFERENCE_OUTPUT_DIRNAME = CURRENT_DIR / "inference_output"
COVER_IMAGE_PATH = SONGS_DIR / "cover.png"
logo_image_path = str(CURRENT_DIR / "Logo.png")
# Ensure the inference output directory exists
INFERENCE_OUTPUT_DIRNAME.mkdir(parents=True, exist_ok=True)
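# Expected on-disk layout, as read by the helpers below:
#   people/<Name>/G_*.pth            trained generator checkpoint
#   people/<Name>/*.json             model config
#   people/<Name>/description.txt    short bio shown in the UI
#   people/<Name>/image.png          portrait shown in the UI
#   songs/<Folder>/vocals.wav        isolated vocal stem
#   songs/<Folder>/instrumentals.wav instrumental stem
#   songs/cover.png                  shared cover art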
def get_people():
    """List the people available for voice conversion."""
    return sorted([p.stem for p in PEOPLE_DIR.iterdir() if p.is_dir()])
def get_songs():
    """List the song directories available for selection."""
    # Use a set to avoid adding duplicate folder names
    folder_names = set()
    # Walk all directories and subdirectories in SONGS_DIR and collect their
    # paths relative to SONGS_DIR (these become the dropdown choices).
    for root, dirs, _ in os.walk(SONGS_DIR):
        for dir_name in dirs:
            relative_path = os.path.relpath(os.path.join(root, dir_name), SONGS_DIR)
            folder_names.add(relative_path)
    # Return a sorted list of unique folder names
    return sorted(folder_names)
def run_inference(speaker, path_str, f0_method, transpose, noise_scale, cluster_ratio):
    """Run `svc infer` on the given audio file and return (output_path, error)."""
    path = Path(path_str)  # ensure we are working with a Path object
    model_path = speaker["model_path"]
    config_path = speaker["cfg_path"]
    cluster_path = speaker["cluster_path"]
    output_path = INFERENCE_OUTPUT_DIRNAME / path.name
    cluster_args = f"-k \"{cluster_path}\" -r {cluster_ratio}" if cluster_path and cluster_ratio > 0 else ""
    # Ensure the output directory exists
    INFERENCE_OUTPUT_DIRNAME.mkdir(parents=True, exist_ok=True)
    inference_cmd = (
        f"svc infer \"{path.absolute()}\" -m \"{model_path}\" -c \"{config_path}\" "
        f"{cluster_args} -t {transpose} --f0-method {f0_method} -n {noise_scale} "
        f"-o \"{output_path}\" --no-auto-predict-f0"
    )
    result = subprocess.run(shlex.split(inference_cmd), stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    if result.returncode != 0:
        print(f"Command failed with return code {result.returncode}")
        print("STDERR:", result.stderr)
        return None, "⚠️ Error during inference."
    if "AttributeError" in result.stderr:
        return None, "⚠️ Incompatible SVC model."
    if not output_path.exists():
        print("Expected output file not found:", output_path)
        return None, "⚠️ Error: Output file not found."
    return str(output_path), None
def get_speaker_details(speaker_name):
    """Locate the model files for a speaker under PEOPLE_DIR (case-insensitive)."""
    for speaker_path in PEOPLE_DIR.iterdir():
        if speaker_path.is_dir() and speaker_path.name.lower() == speaker_name.lower():
            model_path = next(speaker_path.glob('G_*.pth'), None)
            description_path = speaker_path / "description.txt"
            image_path = next(speaker_path.glob('image.png'), None)
            cfg_path = next(speaker_path.glob('*.json'), None)
            if model_path and cfg_path:
                return {
                    "model_path": model_path,
                    "cfg_path": cfg_path,
                    "description_path": description_path,
                    "cluster_path": "",
                    "image_path": image_path
                }
    return None
def mix_audio(vocals_path, instrumentals_path, output_path):
    """Mix the vocal and instrumental stems 50/50 and write a 16-bit WAV."""
    # sr=None preserves the original sample rate; both stems are assumed to share it.
    y_vocals, sr = librosa.load(vocals_path, sr=None)
    y_instrumentals, _ = librosa.load(instrumentals_path, sr=None)
    # Pad the shorter track so both are the same length
    max_length = max(len(y_vocals), len(y_instrumentals))
    y_vocals_padded = np.pad(y_vocals, (0, max_length - len(y_vocals)), mode='constant')
    y_instrumentals_padded = np.pad(y_instrumentals, (0, max_length - len(y_instrumentals)), mode='constant')
    # Mix the two tracks at equal gain
    mixed_audio = 0.5 * y_vocals_padded + 0.5 * y_instrumentals_padded
    # Save the mixed audio as 16-bit PCM
    wavfile.write(output_path, sr, (mixed_audio * 32767).astype(np.int16))
def gen_caption_image(song_name, person):
    """Generate an Instagram caption and cover image for the converted track."""
    prompt = (
        f"Create a creative and engaging Instagram caption for a post where {person} "
        f"is performing the song '{song_name}'. Make it exciting and suitable for social media."
    )
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "user", "content": prompt}
        ]
    )
    caption = response.choices[0].message.content
    image_prompt = (
        f"Draw an image for an Instagram post about the song '{song_name}' "
        f"with the caption: {caption}"
    )
    responseimage = client.images.generate(
        model="dall-e-3",
        prompt=image_prompt,
        size="1024x1024",
        quality="standard",
        n=1,
    )
    # Download the generated image and return it as a NumPy array for gr.Image(type="numpy")
    image_url = responseimage.data[0].url
    image_response = requests.get(image_url)
    img = Image.open(BytesIO(image_response.content))
    return np.array(img), caption
def voice_conversion(person, song_selection):
    """Convert the selected song's vocals to the selected voice and return the remixed file path."""
    try:
        speaker_details = get_speaker_details(person)
        if not speaker_details:
            raise Exception("Speaker not found! Error: Speaker details not found.")
        # Extract the top-level folder name from the song selection (ignoring any file part),
        # using Path so this works with both Windows and POSIX separators.
        folder_name = Path(song_selection).parts[0]
        vocals_path = SONGS_DIR / folder_name / "vocals.wav"
        instrumentals_path = SONGS_DIR / folder_name / "instrumentals.wav"
        if not vocals_path.exists() or not instrumentals_path.exists():
            raise Exception(f"Vocals or instrumentals not found in {folder_name}! Error: Audio file not found.")
        converted_vocals_path, error = run_inference(speaker_details, str(vocals_path), "crepe", 0, 0.4, 0)
        if error:
            raise Exception(error)
        converted_mix_path = str(INFERENCE_OUTPUT_DIRNAME / f"{folder_name}_converted_mix.wav")
        mix_audio(converted_vocals_path, str(instrumentals_path), converted_mix_path)
        return converted_mix_path
    except Exception as e:
        # The Gradio click handler expects a single output, so report the error and return nothing.
        print(f"Voice conversion failed: {e}")
        return None
def update_person_details(selected_person):
    """Return the portrait image path and description text for the selected person."""
    speaker_details = get_speaker_details(selected_person)
    image_path = str(speaker_details["image_path"])
    person_description = ""
    if speaker_details["description_path"].exists():
        with open(speaker_details["description_path"], 'r') as file:
            person_description = file.read()
    return image_path, person_description
def update_song_features(song_selection):
    """Build a preview mix of the original vocals and instrumentals for the selected song."""
    cover_image_path = str(COVER_IMAGE_PATH)
    folder_name = Path(song_selection).parts[0]
    vocals_path = SONGS_DIR / folder_name / "vocals.wav"
    instrumentals_path = SONGS_DIR / folder_name / "instrumentals.wav"
    if not vocals_path.exists() or not instrumentals_path.exists():
        raise Exception(f"Vocals or instrumentals not found in {folder_name}! Error: Audio file not found.")
    original_mix_path = str(INFERENCE_OUTPUT_DIRNAME / f"{folder_name}_original_mix.wav")
    mix_audio(str(vocals_path), str(instrumentals_path), original_mix_path)
    return original_mix_path, cover_image_path
# CSS for the artistic theme and custom positioning
css = """
@import url('https://fonts.googleapis.com/css2?family=Pacifico&display=swap');
.center-container {
display: flex;
justify-content: center;
align-items: center;
}
.right-container {
display: flex;
justify-content: flex-end;
align-items: center;
}
/* Additional custom CSS for spacing and alignment */
.logo-img {
max-width: 100%;
height: auto;
display: block;
margin-left: auto;
margin-right: auto;
}
.alchemy-img {
float: right;
max-height: 60px; /* Adjust based on your preference */
}
"""
header_markdown = """
<div style="text-align: center; font-weight: bold; font-size: 18px;">
<span style="margin-right: 20px;">TECHNATION</span>
<span style="font-size: 24px;">VoiceBlend by Alchemy AI</span>
<span style="margin-left: 20px;">McGill University</span>
</div>
"""
# Gradio app with custom CSS and theme
with gr.Blocks(css=css) as app:
    with gr.Row():
        # gr.Image(logo_image_path, container=False, show_label=False, show_download_button=False, height=100, width=100)
        gr.Markdown(header_markdown)
    with gr.Row():
        with gr.Column():
            gr.Markdown("#### Step 1: Pick Your Track 🎵")
            song_dropdown = gr.Dropdown(label="Pick one of 5 songs:", choices=get_songs())
            album_image = gr.Image(label="Cover Image", height=300)
            original_mix_audio = gr.Audio(label="Original Mix (Vocals + Instrumentals)", interactive=False)
            song_dropdown.change(fn=update_song_features, inputs=song_dropdown, outputs=[original_mix_audio, album_image])
        with gr.Column():
            gr.Markdown("#### Step 2: Select Your Voice to Emulate 👩🏻")
            person_dropdown = gr.Dropdown(label="Select one of 15 voices:", choices=get_people())
            person_image = gr.Image(label="Person", height=300)
            person_description_text = gr.Textbox(label="Description:")
            person_dropdown.change(update_person_details, inputs=person_dropdown, outputs=[person_image, person_description_text])
            convert_button = gr.Button("Convert")
        with gr.Column():
            gr.Markdown("#### Your Custom AI Vocal - Unveiled! 🤖")
            converted_mix_audio = gr.Audio(label="Converted Mix (Converted Vocals + Instrumentals)", type="filepath")
            convert_button.click(voice_conversion, inputs=[person_dropdown, song_dropdown], outputs=[converted_mix_audio])
            generate_button = gr.Button("Generate Social Media Post")
            generated_image = gr.Image(label="Generated Image", type="numpy", height=300)
            generated_caption = gr.Textbox(label="Generated Caption")
            generate_button.click(gen_caption_image, inputs=[song_dropdown, person_dropdown], outputs=[generated_image, generated_caption])

if __name__ == "__main__":
    app.launch(share=True)