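"""VoiceBlend demo app.

A Gradio interface that lets a user pick a song, pick a target voice model,
run singing-voice conversion on the vocal stem, remix the converted vocals
with the instrumentals, and generate an Instagram-style caption and cover
image with the OpenAI API.

Assumptions (not enforced here): the `svc` command-line tool used for
inference is installed and on PATH, and OPENAI_API_KEY is set in the
environment or in a .env file.
"""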
from pathlib import Path
from io import BytesIO
import os
import shlex
import subprocess

import gradio as gr
import numpy as np
import librosa
import requests
from scipy.io import wavfile
from openai import OpenAI
from dotenv import load_dotenv
from PIL import Image

# Load environment variables (expects OPENAI_API_KEY) and create the OpenAI client.
# Note: the key must be passed to the client (or present in the environment);
# setting it as a class attribute on OpenAI has no effect.
load_dotenv()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
# Paths according to your directory structure
CURRENT_DIR = Path(__file__).resolve().parent
PEOPLE_DIR = CURRENT_DIR / "people"
SONGS_DIR = CURRENT_DIR / "songs"
INFERENCE_OUTPUT_DIRNAME = CURRENT_DIR / "inference_output"
COVER_IMAGE_PATH = SONGS_DIR / "cover.png"
logo_image_path = str(CURRENT_DIR / "Logo.png")
# Ensure the inference output directory exists
INFERENCE_OUTPUT_DIRNAME.mkdir(parents=True, exist_ok=True)
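# Expected on-disk layout, as read by the helpers below:
#   people/<Name>/G_*.pth            trained generator checkpoint
#   people/<Name>/*.json             model config
#   people/<Name>/description.txt    short bio shown in the UI
#   people/<Name>/image.png          portrait shown in the UI
#   songs/<Folder>/vocals.wav        isolated vocal stem
#   songs/<Folder>/instrumentals.wav instrumental stem
#   songs/cover.png                  shared cover art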
def get_people():
    """List the people available for voice conversion."""
    return sorted([p.stem for p in PEOPLE_DIR.iterdir() if p.is_dir()])
def get_songs():
    """List the song directories available for selection."""
    # Use a set to avoid adding duplicate folder names
    folder_names = set()
    # Walk all directories and subdirectories in SONGS_DIR and collect their
    # paths relative to SONGS_DIR (these become the dropdown choices).
    for root, dirs, _ in os.walk(SONGS_DIR):
        for dir_name in dirs:
            relative_path = os.path.relpath(os.path.join(root, dir_name), SONGS_DIR)
            folder_names.add(relative_path)
    # Return a sorted list of unique folder names
    return sorted(folder_names)
def run_inference(speaker, path_str, f0_method, transpose, noise_scale, cluster_ratio):
    """Run `svc infer` on the given audio file and return (output_path, error)."""
    path = Path(path_str)  # ensure we are working with a Path object
    model_path = speaker["model_path"]
    config_path = speaker["cfg_path"]
    cluster_path = speaker["cluster_path"]
    output_path = INFERENCE_OUTPUT_DIRNAME / path.name
    cluster_args = f"-k \"{cluster_path}\" -r {cluster_ratio}" if cluster_path and cluster_ratio > 0 else ""
    # Ensure the output directory exists
    INFERENCE_OUTPUT_DIRNAME.mkdir(parents=True, exist_ok=True)
    inference_cmd = (
        f"svc infer \"{path.absolute()}\" -m \"{model_path}\" -c \"{config_path}\" "
        f"{cluster_args} -t {transpose} --f0-method {f0_method} -n {noise_scale} "
        f"-o \"{output_path}\" --no-auto-predict-f0"
    )
    result = subprocess.run(shlex.split(inference_cmd), stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    if result.returncode != 0:
        print(f"Command failed with return code {result.returncode}")
        print("STDERR:", result.stderr)
        return None, "⚠️ Error during inference."
    if "AttributeError" in result.stderr:
        return None, "⚠️ Incompatible SVC model."
    if not output_path.exists():
        print("Expected output file not found:", output_path)
        return None, "⚠️ Error: Output file not found."
    return str(output_path), None
def get_speaker_details(speaker_name):
    """Locate the model files for a speaker under PEOPLE_DIR (case-insensitive)."""
    for speaker_path in PEOPLE_DIR.iterdir():
        if speaker_path.is_dir() and speaker_path.name.lower() == speaker_name.lower():
            model_path = next(speaker_path.glob('G_*.pth'), None)
            description_path = speaker_path / "description.txt"
            image_path = next(speaker_path.glob('image.png'), None)
            cfg_path = next(speaker_path.glob('*.json'), None)
            if model_path and cfg_path:
                return {
                    "model_path": model_path,
                    "cfg_path": cfg_path,
                    "description_path": description_path,
                    "cluster_path": "",
                    "image_path": image_path
                }
    return None
def mix_audio(vocals_path, instrumentals_path, output_path):
    """Mix the vocal and instrumental stems 50/50 and write a 16-bit WAV."""
    # sr=None preserves the original sample rate; both stems are assumed to share it.
    y_vocals, sr = librosa.load(vocals_path, sr=None)
    y_instrumentals, _ = librosa.load(instrumentals_path, sr=None)
    # Pad the shorter track so both are the same length
    max_length = max(len(y_vocals), len(y_instrumentals))
    y_vocals_padded = np.pad(y_vocals, (0, max_length - len(y_vocals)), mode='constant')
    y_instrumentals_padded = np.pad(y_instrumentals, (0, max_length - len(y_instrumentals)), mode='constant')
    # Mix the two tracks at equal gain
    mixed_audio = 0.5 * y_vocals_padded + 0.5 * y_instrumentals_padded
    # Save the mixed audio as 16-bit PCM
    wavfile.write(output_path, sr, (mixed_audio * 32767).astype(np.int16))
def gen_caption_image(song_name, person):
    """Generate an Instagram caption and cover image for the converted track."""
    prompt = (
        f"Create a creative and engaging Instagram caption for a post where {person} "
        f"is performing the song '{song_name}'. Make it exciting and suitable for social media."
    )
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "user", "content": prompt}
        ]
    )
    caption = response.choices[0].message.content
    image_prompt = (
        f"Draw an image for an Instagram post about the song '{song_name}' "
        f"with the caption: {caption}"
    )
    responseimage = client.images.generate(
        model="dall-e-3",
        prompt=image_prompt,
        size="1024x1024",
        quality="standard",
        n=1,
    )
    # Download the generated image and return it as a NumPy array for gr.Image(type="numpy")
    image_url = responseimage.data[0].url
    image_response = requests.get(image_url)
    img = Image.open(BytesIO(image_response.content))
    return np.array(img), caption
def voice_conversion(person, song_selection):
    """Convert the selected song's vocals to the selected voice and return the remixed file path."""
    try:
        speaker_details = get_speaker_details(person)
        if not speaker_details:
            raise Exception("Speaker not found! Error: Speaker details not found.")
        # Extract the top-level folder name from the song selection (ignoring any file part),
        # using Path so this works with both Windows and POSIX separators.
        folder_name = Path(song_selection).parts[0]
        vocals_path = SONGS_DIR / folder_name / "vocals.wav"
        instrumentals_path = SONGS_DIR / folder_name / "instrumentals.wav"
        if not vocals_path.exists() or not instrumentals_path.exists():
            raise Exception(f"Vocals or instrumentals not found in {folder_name}! Error: Audio file not found.")
        converted_vocals_path, error = run_inference(speaker_details, str(vocals_path), "crepe", 0, 0.4, 0)
        if error:
            raise Exception(error)
        converted_mix_path = str(INFERENCE_OUTPUT_DIRNAME / f"{folder_name}_converted_mix.wav")
        mix_audio(converted_vocals_path, str(instrumentals_path), converted_mix_path)
        return converted_mix_path
    except Exception as e:
        # The Gradio click handler expects a single output, so report the error and return nothing.
        print(f"Voice conversion failed: {e}")
        return None
def update_person_details(selected_person):
    """Return the portrait image path and description text for the selected person."""
    speaker_details = get_speaker_details(selected_person)
    image_path = str(speaker_details["image_path"])
    person_description = ""
    if speaker_details["description_path"].exists():
        with open(speaker_details["description_path"], 'r') as file:
            person_description = file.read()
    return image_path, person_description
def update_song_features(song_selection):
    """Build a preview mix of the original vocals and instrumentals for the selected song."""
    cover_image_path = str(COVER_IMAGE_PATH)
    folder_name = Path(song_selection).parts[0]
    vocals_path = SONGS_DIR / folder_name / "vocals.wav"
    instrumentals_path = SONGS_DIR / folder_name / "instrumentals.wav"
    if not vocals_path.exists() or not instrumentals_path.exists():
        raise Exception(f"Vocals or instrumentals not found in {folder_name}! Error: Audio file not found.")
    original_mix_path = str(INFERENCE_OUTPUT_DIRNAME / f"{folder_name}_original_mix.wav")
    mix_audio(str(vocals_path), str(instrumentals_path), original_mix_path)
    return original_mix_path, cover_image_path
# CSS for the artistic theme and custom positioning
css = """
@import url('https://fonts.googleapis.com/css2?family=Pacifico&display=swap');
.center-container {
display: flex;
justify-content: center;
align-items: center;
}
.right-container {
display: flex;
justify-content: flex-end;
align-items: center;
}
/* Additional custom CSS for spacing and alignment */
.logo-img {
max-width: 100%;
height: auto;
display: block;
margin-left: auto;
margin-right: auto;
}
.alchemy-img {
float: right;
max-height: 60px; /* Adjust based on your preference */
}
"""
header_markdown = """
<div style="text-align: center; font-weight: bold; font-size: 18px;">
<span style="margin-right: 20px;">TECHNATION</span>
<span style="font-size: 24px;">VoiceBlend by Alchemy AI</span>
<span style="margin-left: 20px;">McGill University</span>
</div>
"""
# Gradio app with custom CSS and theme
with gr.Blocks(css=css) as app:
    with gr.Row():
        # gr.Image(logo_image_path, container=False, show_label=False, show_download_button=False, height=100, width=100)
        gr.Markdown(header_markdown)
    with gr.Row():
        with gr.Column():
            gr.Markdown("#### Step 1: Pick Your Track 🎵")
            song_dropdown = gr.Dropdown(label="Pick one of 5 songs:", choices=get_songs())
            album_image = gr.Image(label="Cover Image", height=300)
            original_mix_audio = gr.Audio(label="Original Mix (Vocals + Instrumentals)", interactive=False)
            song_dropdown.change(fn=update_song_features, inputs=song_dropdown, outputs=[original_mix_audio, album_image])
        with gr.Column():
            gr.Markdown("#### Step 2: Select Your Voice to Emulate 👩🏻")
            person_dropdown = gr.Dropdown(label="Select one of 15 voices:", choices=get_people())
            person_image = gr.Image(label="Person", height=300)
            person_description_text = gr.Textbox(label="Description:")
            person_dropdown.change(update_person_details, inputs=person_dropdown, outputs=[person_image, person_description_text])
            convert_button = gr.Button("Convert")
        with gr.Column():
            gr.Markdown("#### Your Custom AI Vocal - Unveiled! 🤖")
            converted_mix_audio = gr.Audio(label="Converted Mix (Converted Vocals + Instrumentals)", type="filepath")
            convert_button.click(voice_conversion, inputs=[person_dropdown, song_dropdown], outputs=[converted_mix_audio])
            generate_button = gr.Button("Generate Social Media Post")
            generated_image = gr.Image(label="Generated Image", type="numpy", height=300)
            generated_caption = gr.Textbox(label="Generated Caption")
            generate_button.click(gen_caption_image, inputs=[song_dropdown, person_dropdown], outputs=[generated_image, generated_caption])

if __name__ == "__main__":
    app.launch(share=True)