# NOTE: The lines that preceded this file's code were Hugging Face Spaces page
# artifacts captured during extraction (status text, file size, commit hashes,
# and a line-number gutter). They are not part of the program and have been
# reduced to this comment so the file parses as Python.
from threading import Thread
from pathlib import Path
import gradio as gr
import subprocess
import shutil
import time
import copy
import glob
import json
import os
import shlex
import numpy as np
import librosa
from scipy.io import wavfile
import openai
from openai import OpenAI
from dotenv import load_dotenv
from PIL import Image
import requests
from io import BytesIO
# Load environment variables (expects OPENAI_API_KEY in .env or the environment).
load_dotenv()
# Pass the key explicitly to the client constructor: assigning
# `OpenAI.api_key` as a class attribute has no effect in the v1 openai SDK
# (that was the pre-1.0 module-level pattern).
client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))
# Paths according to your directory structure
CURRENT_DIR = Path(__file__).resolve().parent
# One sub-directory per speaker: G_*.pth model, *.json config, image.png, description.txt.
PEOPLE_DIR = CURRENT_DIR / "people"
# One sub-directory per song, each expected to contain vocals.wav and instrumentals.wav.
SONGS_DIR = CURRENT_DIR / "songs"
# Destination for converted vocals and the mixed output tracks.
INFERENCE_OUTPUT_DIRNAME = CURRENT_DIR / "inference_output"
# NOTE(review): a single shared cover image is shown for every song — confirm intended.
COVER_IMAGE_PATH = SONGS_DIR / "cover.png"
# Resolved relative to the process CWD, unlike the Path-based constants above.
logo_image_path = os.path.abspath("Logo.png")
# Ensure the inference output directory exists
INFERENCE_OUTPUT_DIRNAME.mkdir(parents=True, exist_ok=True)
def get_people():
    """List the people available for voice conversion.

    Returns:
        list[str]: Sorted stems of the sub-directories under PEOPLE_DIR,
        or an empty list when the directory does not exist yet (fresh
        checkout), instead of raising FileNotFoundError.
    """
    if not PEOPLE_DIR.is_dir():
        # iterdir() raises if the folder is absent; an empty dropdown is nicer.
        return []
    # NOTE(review): .stem drops anything after a dot in the folder name, while
    # get_speaker_details matches on the full name — confirm no dotted folders.
    return sorted([p.stem for p in PEOPLE_DIR.iterdir() if p.is_dir()])
def get_songs():
    """List the song directories available for selection.

    Returns:
        list[str]: Sorted relative paths (from SONGS_DIR) of every
        sub-directory, including nested ones.
    """
    # A set guards against accidental duplicates, although os.walk visits
    # each directory exactly once.
    folder_names = set()
    for root, dir_names, _ in os.walk(SONGS_DIR):
        # `dir_name` instead of `dir` to avoid shadowing the builtin.
        for dir_name in dir_names:
            relative_path = os.path.relpath(os.path.join(root, dir_name), SONGS_DIR)
            folder_names.add(relative_path)
    return sorted(folder_names)
def run_inference(speaker, path_str, f0_method, transpose, noise_scale, cluster_ratio):
    """Convert the vocals in *path_str* to the speaker's voice via ``svc infer``.

    Args:
        speaker: dict with "model_path", "cfg_path" and "cluster_path" keys
            (as returned by get_speaker_details).
        path_str: path to the input WAV file.
        f0_method: pitch-extraction method; falsy values fall back to
            "crepe" — the value previously hard-coded in the command, so
            existing callers passing 0 keep their behavior.
        transpose: semitone shift forwarded to ``-t``.
        noise_scale: value forwarded to ``-n`` (was hard-coded to 0.4).
        cluster_ratio: cluster inference ratio; applied only when the speaker
            has a cluster model and the ratio is > 0.

    Returns:
        tuple[str | None, str | None]: (output_path, None) on success,
        (None, error_message) on failure.
    """
    path = Path(path_str)
    output_path = INFERENCE_OUTPUT_DIRNAME / path.name
    # Ensure output directory exists
    INFERENCE_OUTPUT_DIRNAME.mkdir(parents=True, exist_ok=True)

    # Build the argv list directly instead of shlex-splitting an f-string:
    # robust against spaces/quotes in any of the paths.
    command = [
        "svc", "infer", str(path.absolute()),
        "-m", str(speaker["model_path"]),
        "-c", str(speaker["cfg_path"]),
        "-t", str(transpose),
        "--f0-method", str(f0_method) if f0_method else "crepe",
        "-n", str(noise_scale),
        "-o", str(output_path),
        "--no-auto-predict-f0",
    ]
    cluster_path = speaker["cluster_path"]
    if cluster_path and cluster_ratio > 0:
        command += ["-k", str(cluster_path), "-r", str(cluster_ratio)]

    result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

    # Check the incompatible-model signature first: it also exits non-zero, so
    # with the original ordering this branch was effectively unreachable.
    if "AttributeError" in result.stderr:
        return None, "⚠️ Modelo SVC incompatible."
    if result.returncode != 0:
        print(f"Command failed with return code {result.returncode}")
        print("STDERR:", result.stderr)
        return None, "⚠️ Error during inference."
    if not output_path.exists():
        print("Expected output file not found:", output_path)
        return None, "⚠️ Error: Output file not found."
    return str(output_path), None
def get_speaker_details(speaker_name):
    """Find the model files for a speaker by case-insensitive folder name.

    Args:
        speaker_name: name of the speaker folder under PEOPLE_DIR.

    Returns:
        dict | None: paths for "model_path" (G_*.pth), "cfg_path" (*.json),
        "description_path", "cluster_path" (always "" — clustering unused)
        and "image_path"; or None when no matching folder with both a model
        and a config exists.
    """
    wanted = speaker_name.lower()
    for root, dirs, _ in os.walk(PEOPLE_DIR):
        for dir_name in dirs:
            if dir_name.lower() != wanted:
                continue
            # Join against the directory actually being walked; the previous
            # `PEOPLE_DIR / dir_name` produced a wrong path for nested matches.
            speaker_path = Path(root) / dir_name
            model_path = next(speaker_path.glob('G_*.pth'), None)
            cfg_path = next(speaker_path.glob('*.json'), None)
            if model_path and cfg_path:
                return {
                    "model_path": model_path,
                    "cfg_path": cfg_path,
                    # May not exist on disk; callers must handle that.
                    "description_path": speaker_path / "description.txt",
                    "cluster_path": "",
                    "image_path": next(speaker_path.glob('image.png'), None),
                }
    return None
def mix_audio(vocals_path, instrumentals_path, output_path):
    """Mix vocals and instrumentals 50/50 and write a 16-bit PCM WAV.

    Args:
        vocals_path: path to the vocals WAV.
        instrumentals_path: path to the instrumentals WAV.
        output_path: destination path for the mixed WAV.

    Raises:
        ValueError: if the inputs have different sample rates — mixing them
            sample-by-sample would silently detune/desync one track.
    """
    y_vocals, sr = librosa.load(vocals_path, sr=None)
    y_instrumentals, sr_inst = librosa.load(instrumentals_path, sr=None)
    if sr_inst != sr:
        raise ValueError(f"Sample rate mismatch: vocals {sr} Hz vs instrumentals {sr_inst} Hz")
    # Zero-pad the shorter track so both arrays have equal length.
    max_length = max(len(y_vocals), len(y_instrumentals))
    y_vocals_padded = np.pad(y_vocals, (0, max_length - len(y_vocals)), mode='constant')
    y_instrumentals_padded = np.pad(y_instrumentals, (0, max_length - len(y_instrumentals)), mode='constant')
    # Equal-weight mix; clip before the int16 conversion so any samples
    # outside [-1, 1] saturate instead of wrapping around.
    mixed_audio = 0.5 * y_vocals_padded + 0.5 * y_instrumentals_padded
    mixed_audio = np.clip(mixed_audio, -1.0, 1.0)
    wavfile.write(output_path, sr, (mixed_audio * 32767).astype(np.int16))
def gen_caption_image(song_name, person):
    """Generate a social-media post: a GPT-3.5 caption plus a DALL-E 3 image.

    Args:
        song_name: the selected song (folder name from the dropdown).
        person: the selected voice/person name.

    Returns:
        tuple[np.ndarray, str]: the generated image as a numpy array and the
        generated caption text.
    """
    caption_prompt = f"Create a creative and engaging Instagram caption for a post where {person} is performing the song '{song_name}'. Make it exciting and suitable for social media."
    chat_completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": caption_prompt}],
    )
    caption = chat_completion.choices[0].message.content

    # Feed the caption back into the image prompt so the artwork matches it.
    image_prompt = f'draw an image for my instagram post having for the song {song_name} and the caption {caption}'
    image_result = client.images.generate(
        model="dall-e-3",
        prompt=image_prompt,
        size="1024x1024",
        quality="standard",
        n=1,
    )

    # Download the hosted image and decode it in memory.
    download = requests.get(image_result.data[0].url)
    post_image = Image.open(BytesIO(download.content))
    return np.array(post_image), caption
def voice_conversion(person, song_selection):
    """Convert the selected song's vocals to the selected person's voice.

    Args:
        person: speaker name from the person dropdown.
        song_selection: song folder path from the song dropdown.

    Returns:
        str | None: path of the mixed WAV (converted vocals + instrumentals),
        or None on failure so the single Gradio Audio output stays empty.
    """
    try:
        speaker_details = get_speaker_details(person)
        if not speaker_details:
            raise Exception("Speaker not found! Error: Speaker details not found.")
        # Take the top-level folder of the selection; normalize separators so
        # this works for both Windows ('\\') and POSIX ('/') style paths
        # (the previous split('\\') only handled Windows).
        folder_name = song_selection.replace("\\", "/").split("/")[0]
        vocals_path = SONGS_DIR / folder_name / "vocals.wav"
        instrumentals_path = SONGS_DIR / folder_name / "instrumentals.wav"
        if not vocals_path.exists() or not instrumentals_path.exists():
            raise Exception(f"Vocals or instrumentals not found in {folder_name}! Error: Audio file not found.")
        converted_vocals_path, error = run_inference(speaker_details, str(vocals_path), 0, 0, 0.4, 0)
        if error:
            raise Exception(error)
        converted_mix_path = str(INFERENCE_OUTPUT_DIRNAME / f"{folder_name}_converted_mix.wav")
        mix_audio(converted_vocals_path, str(instrumentals_path), converted_mix_path)
        return converted_mix_path
    except Exception as e:
        # Return a single None: this handler previously returned a 4-tuple,
        # which Gradio would mis-map onto the single Audio output component.
        print(f"voice_conversion failed: {e}")
        return None
def update_person_details(selected_person):
    """Return (image_path, description) for the selected speaker.

    Args:
        selected_person: speaker name from the dropdown.

    Returns:
        tuple[str | None, str]: path of the speaker's image (or None when
        the speaker cannot be resolved) and the description text.
    """
    speaker_details = get_speaker_details(selected_person)
    if not speaker_details:
        # Previously this fell through to a TypeError on subscripting None.
        return None, f"Speaker '{selected_person}' not found."
    image_path = str(speaker_details["image_path"])
    try:
        with open(speaker_details["description_path"], 'r', encoding='utf-8') as file:
            person_description = file.read()
    except FileNotFoundError:
        # description.txt is optional per speaker folder.
        person_description = "No description available."
    return image_path, person_description
def update_song_features(song_selection):
    """Build the original vocals+instrumentals mix for the selected song.

    Args:
        song_selection: song folder path from the dropdown.

    Returns:
        tuple[str, Path]: path of the freshly mixed WAV and the (shared)
        cover image path.

    Raises:
        Exception: when vocals.wav or instrumentals.wav is missing.
    """
    cover_image_path = COVER_IMAGE_PATH
    # Normalize separators so the top-level folder is extracted on both
    # Windows and POSIX (the previous split('\\') only handled Windows).
    folder_name = song_selection.replace("\\", "/").split("/")[0]
    vocals_path = SONGS_DIR / folder_name / "vocals.wav"
    instrumentals_path = SONGS_DIR / folder_name / "instrumentals.wav"
    if not vocals_path.exists() or not instrumentals_path.exists():
        raise Exception(f"Vocals or instrumentals not found in {folder_name}! Error: Audio file not found.")
    original_mix_path = str(INFERENCE_OUTPUT_DIRNAME / f"{folder_name}_original_mix.wav")
    mix_audio(str(vocals_path), str(instrumentals_path), original_mix_path)
    return original_mix_path, cover_image_path
# Custom CSS for the Gradio page: imports the Pacifico display font and adds
# flexbox helpers plus logo sizing rules used by the layout below.
css = """
@import url('https://fonts.googleapis.com/css2?family=Pacifico&display=swap');
.center-container {
display: flex;
justify-content: center;
align-items: center;
}
.right-container {
display: flex;
justify-content: flex-end;
align-items: center;
}
/* Additional custom CSS for spacing and alignment */
.logo-img {
max-width: 100%;
height: auto;
display: block;
margin-left: auto;
margin-right: auto;
}
.alchemy-img {
float: right;
max-height: 60px; /* Adjust based on your preference */
}
"""
# Static HTML banner rendered via gr.Markdown at the top of the page.
header_markdown = """
<div style="text-align: center; font-weight: bold; font-size: 18px;">
<span style="margin-right: 20px;">TECHNATION</span>
<span style="font-size: 24px;">VoiceBlend by Alchemy AI</span>
<span style="margin-left: 20px;">McGill University</span>
</div>
"""
# Gradio app with custom CSS and theme.
# Layout: a header row, then three columns — song pick, voice pick, result.
with gr.Blocks(css=css) as app:
    with gr.Row():
        # gr.Image(logo_image_path, container=False, show_label=False, show_download_button=False, height=100, width=100)
        gr.Markdown(header_markdown)
    with gr.Row():
        with gr.Column():
            # Step 1: song dropdown; selecting a song shows the cover image
            # and builds/plays the original (unconverted) mix.
            gr.Markdown("#### Step 1: Pick Your Track 🎵")
            song_dropdown = gr.Dropdown(label="Pick one of 5 songs:", choices=get_songs())
            album_image = gr.Image(label="Cover Image", height=300)
            original_mix_audio = gr.Audio(label="Original Mix (Vocals + Instrumentals)", interactive=False)
            song_dropdown.change(fn=update_song_features, inputs=song_dropdown, outputs=[original_mix_audio, album_image])
        with gr.Column():
            # Step 2: voice dropdown; selecting a person shows their photo
            # and description, and the Convert button kicks off inference.
            gr.Markdown("#### Step 2: Select Your Voice to Emulate 👩🏻")
            person_dropdown = gr.Dropdown(label="Select one of 15 voices:", choices=get_people())
            person_image = gr.Image(label="Person", height=300)
            person_description_text = gr.Textbox(label="Description:")
            person_dropdown.change(update_person_details, inputs=person_dropdown, outputs=[person_image, person_description_text])
            convert_button = gr.Button("Convert")
        with gr.Column():
            # Result column: the converted mix plus an optional OpenAI-generated
            # social-media caption and image.
            gr.Markdown("#### Your Custom AI Vocal - Unveiled! 🤖")
            converted_mix_audio = gr.Audio(label="Converted Mix (Converted Vocals + Instrumentals)", type="filepath")
            convert_button.click(voice_conversion, inputs=[person_dropdown, song_dropdown], outputs=[converted_mix_audio])
            generate_button = gr.Button("Generate Social Media Post")
            generate_button.click(gen_caption_image, inputs=[song_dropdown, person_dropdown], outputs=[gr.Image(label="Generated Image", type="numpy", height=300), gr.Textbox(label="Generated Caption")])
if __name__ == "__main__":
    # share=True publishes a temporary public Gradio link for the demo.
    # (A stray trailing '|' extraction artifact was removed from this line.)
    app.launch(share=True)