Spaces:
Sleeping
Sleeping
| from threading import Thread | |
| from pathlib import Path | |
| import gradio as gr | |
| import subprocess | |
| import shutil | |
| import time | |
| import copy | |
| import glob | |
| import json | |
| import os | |
| import shlex | |
| import numpy as np | |
| import librosa | |
| from scipy.io import wavfile | |
| import openai | |
| from openai import OpenAI | |
| from dotenv import load_dotenv | |
| from PIL import Image | |
| import requests | |
| from io import BytesIO | |
# Load environment variables from a local .env file (supplies OPENAI_API_KEY).
load_dotenv()
# Pass the key to the client constructor. The previous code assigned
# `OpenAI.api_key` on the *class*, which the v1 SDK never reads — the
# client was effectively created without an explicit key.
client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))
# Paths according to your directory structure.
# Directory containing this script; all data paths are resolved relative to it.
CURRENT_DIR = Path(__file__).resolve().parent
# One subfolder per voice: G_*.pth model, *.json config, image.png, description.txt.
PEOPLE_DIR = CURRENT_DIR / "people"
# One subfolder per song: vocals.wav + instrumentals.wav stems.
SONGS_DIR = CURRENT_DIR / "songs"
# Converted vocals and mixed outputs are written here.
INFERENCE_OUTPUT_DIRNAME = CURRENT_DIR / "inference_output"
# Cover art shown for the selected song (single shared image).
COVER_IMAGE_PATH = SONGS_DIR / "cover.png"
# App logo; currently unused (the gr.Image call below is commented out).
logo_image_path = os.path.abspath("Logo.png")
# Ensure the inference output directory exists.
INFERENCE_OUTPUT_DIRNAME.mkdir(parents=True, exist_ok=True)
def get_people():
    """List the people (voice models) available for conversion.

    Returns:
        Sorted list of folder names (``Path.stem``) under ``PEOPLE_DIR``,
        or an empty list when the directory does not exist yet —
        previously a missing directory crashed the dropdown at startup.
    """
    if not PEOPLE_DIR.is_dir():
        return []
    return sorted(p.stem for p in PEOPLE_DIR.iterdir() if p.is_dir())
def get_songs():
    """List the song directories available for selection.

    Returns:
        Sorted list of every subdirectory under ``SONGS_DIR``, expressed
        as a path relative to ``SONGS_DIR`` (nested folders appear as
        e.g. ``artist/song``). Empty list when ``SONGS_DIR`` is missing.
    """
    if not SONGS_DIR.is_dir():
        return []
    # pathlib's rglob replaces the old os.walk loop (which also shadowed
    # the builtin `dir`); a set is kept to guarantee uniqueness.
    folder_names = {
        str(sub.relative_to(SONGS_DIR))
        for sub in SONGS_DIR.rglob("*")
        if sub.is_dir()
    }
    return sorted(folder_names)
def run_inference(speaker, path_str, f0_method, transpose, noise_scale, cluster_ratio):
    """Run ``svc infer`` to convert the vocals at *path_str* to *speaker*'s voice.

    Args:
        speaker: dict from get_speaker_details() with "model_path",
            "cfg_path" and "cluster_path" entries.
        path_str: path to the input vocals WAV.
        f0_method: pitch-extraction method name; non-string / empty values
            fall back to "crepe" (the caller currently passes 0, which the
            old code silently ignored by hard-coding crepe).
        transpose: semitone shift passed to ``-t``.
        noise_scale: value for ``-n`` (was hard-coded to 0.4 despite being
            a parameter).
        cluster_ratio: cluster-infer ratio; cluster args are only added when
            a cluster model path is set and the ratio is positive.

    Returns:
        (output_path_str, None) on success, or (None, error_message).
    """
    path = Path(path_str)
    model_path = speaker["model_path"]
    config_path = speaker["cfg_path"]
    cluster_path = speaker["cluster_path"]
    output_path = Path(INFERENCE_OUTPUT_DIRNAME, path.name)

    # Ensure output directory exists.
    INFERENCE_OUTPUT_DIRNAME.mkdir(parents=True, exist_ok=True)

    # Honor the f0_method parameter when it is a usable string; keep the
    # old behavior (crepe) otherwise so existing callers are unaffected.
    f0 = f0_method if isinstance(f0_method, str) and f0_method else "crepe"

    # Build the argv list directly instead of formatting a shell string and
    # re-splitting it with shlex — robust against quotes/spaces in paths.
    command = [
        "svc", "infer", str(path.absolute()),
        "-m", str(model_path),
        "-c", str(config_path),
    ]
    if cluster_path and cluster_ratio > 0:
        command += ["-k", str(cluster_path), "-r", str(cluster_ratio)]
    command += [
        "-t", str(transpose),
        "--f0-method", f0,
        "-n", str(noise_scale),
        "-o", str(output_path),
        "--no-auto-predict-f0",
    ]

    result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    if result.returncode != 0:
        print(f"Command failed with return code {result.returncode}")
        print("STDERR:", result.stderr)
        return None, "⚠️ Error during inference."
    if "AttributeError" in result.stderr:
        return None, "⚠️ Modelo SVC incompatible."
    if not output_path.exists():
        print("Expected output file not found:", output_path)
        return None, "⚠️ Error: Output file not found."
    return str(output_path), None
def get_speaker_details(speaker_name):
    """Look up the model files for *speaker_name* under ``PEOPLE_DIR``.

    Folder-name matching is case-insensitive. Returns a dict with
    "model_path", "cfg_path", "description_path", "cluster_path" (always
    "" — no cluster models are shipped) and "image_path" (None when no
    image.png exists), or None when no matching folder has both a
    ``G_*.pth`` model and a ``*.json`` config.

    The old implementation walked PEOPLE_DIR recursively but then joined
    the matched name directly onto PEOPLE_DIR, so nested matches resolved
    to a wrong path; iterating the top level only is what actually worked.
    """
    if not PEOPLE_DIR.is_dir():
        return None
    wanted = speaker_name.lower()
    for speaker_path in PEOPLE_DIR.iterdir():
        if not speaker_path.is_dir() or speaker_path.name.lower() != wanted:
            continue
        model_path = next(speaker_path.glob('G_*.pth'), None)
        cfg_path = next(speaker_path.glob('*.json'), None)
        if model_path and cfg_path:
            return {
                "model_path": model_path,
                "cfg_path": cfg_path,
                "description_path": speaker_path / "description.txt",
                "cluster_path": "",
                "image_path": next(speaker_path.glob('image.png'), None),
            }
    return None
def mix_audio(vocals_path, instrumentals_path, output_path):
    """Blend vocals and instrumentals 50/50 and write a 16-bit PCM WAV.

    Both tracks are loaded at their native sample rate (the vocals' rate
    is used for the output file) and the shorter one is zero-padded so
    the lengths match before mixing.
    """
    vocals, sample_rate = librosa.load(vocals_path, sr=None)
    instrumentals, _ = librosa.load(instrumentals_path, sr=None)

    # Pad both signals out to the length of the longer one.
    target_len = max(len(vocals), len(instrumentals))
    vocals = np.pad(vocals, (0, target_len - len(vocals)), mode='constant')
    instrumentals = np.pad(instrumentals, (0, target_len - len(instrumentals)), mode='constant')

    # Equal-weight mix, then scale float [-1, 1] samples to int16 range.
    blend = 0.5 * vocals + 0.5 * instrumentals
    wavfile.write(output_path, sample_rate, (blend * 32767).astype(np.int16))
def gen_caption_image(song_name, person):
    """Generate a social-media post for *person* covering *song_name*.

    Uses gpt-3.5-turbo for an Instagram caption, then DALL·E 3 for a
    matching image, which is downloaded from the returned URL.

    Returns:
        (image_as_numpy_array, caption_text)
    """
    prompt = f"Create a creative and engaging Instagram caption for a post where {person} is performing the song '{song_name}'. Make it exciting and suitable for social media."
    chat_completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}],
    )
    caption = chat_completion.choices[0].message.content

    image_prompt = f'draw an image for my instagram post having for the song {song_name} and the caption {caption}'
    generation = client.images.generate(
        model="dall-e-3",
        prompt=image_prompt,
        size="1024x1024",
        quality="standard",
        n=1,
    )

    # The API returns a temporary URL; fetch the bytes and decode with PIL.
    download = requests.get(generation.data[0].url)
    image = Image.open(BytesIO(download.content))
    return np.array(image), caption
def voice_conversion(person, song_selection):
    """Convert the selected song's vocals to *person*'s voice and mix them back.

    Args:
        person: name from the people dropdown.
        song_selection: song folder path relative to SONGS_DIR, as produced
            by get_songs() (may contain either / or \\ separators).

    Returns:
        Path of the converted mix WAV, or None on failure. The previous
        version returned a 4-tuple from the except branch while the success
        path returned a single value — Gradio wires exactly one output, so
        the 4-tuple was a bug; errors are now printed and None is returned.
    """
    try:
        speaker_details = get_speaker_details(person)
        if not speaker_details:
            raise Exception("Speaker not found! Error: Speaker details not found.")
        # Normalize separators so dropdown values work on Windows and Linux
        # alike (the old code split only on '\\', a Windows-only assumption).
        folder_name = song_selection.replace('\\', '/')
        song_dir = SONGS_DIR / folder_name
        vocals_path = song_dir / "vocals.wav"
        instrumentals_path = song_dir / "instrumentals.wav"
        if not vocals_path.exists() or not instrumentals_path.exists():
            raise Exception(f"Vocals or instrumentals not found in {folder_name}! Error: Audio file not found.")
        converted_vocals_path, error = run_inference(speaker_details, str(vocals_path), 0, 0, 0.4, 0)
        if error:
            raise Exception(error)
        # Flatten nested folder names so the output filename contains no
        # path separators (a nested name previously produced a bogus path).
        safe_name = folder_name.replace('/', '_')
        converted_mix_path = str(INFERENCE_OUTPUT_DIRNAME / f"{safe_name}_converted_mix.wav")
        mix_audio(converted_vocals_path, str(instrumentals_path), converted_mix_path)
        return converted_mix_path
    except Exception as e:
        print("voice_conversion failed:", e)
        return None
def update_person_details(selected_person):
    """Return (image_path, description_text) for the person dropdown change.

    Guards the failure modes the old version crashed on: an unknown
    speaker (get_speaker_details returns None → TypeError), a missing
    image.png (str(None) == "None" was passed to gr.Image), and a missing
    description.txt (unguarded open()).
    """
    speaker_details = get_speaker_details(selected_person)
    if not speaker_details:
        return None, "Speaker details not found."
    image_path = str(speaker_details["image_path"]) if speaker_details["image_path"] else None
    try:
        with open(speaker_details["description_path"], 'r') as file:
            person_description = file.read()
    except OSError:
        person_description = ""
    return image_path, person_description
def update_song_features(song_selection):
    """Return (original_mix_path, cover_image_path) for the song dropdown change.

    Mixes the song's vocals and instrumentals into a preview WAV under
    INFERENCE_OUTPUT_DIRNAME and pairs it with the shared cover image.

    Raises:
        Exception: when the song folder lacks vocals.wav or instrumentals.wav
            (Gradio surfaces the message to the user).
    """
    cover_image_path = COVER_IMAGE_PATH
    # Normalize separators so dropdown values work on Windows and Linux
    # alike (the old code split only on '\\', a Windows-only assumption).
    folder_name = song_selection.replace('\\', '/')
    song_dir = SONGS_DIR / folder_name
    vocals_path = song_dir / "vocals.wav"
    instrumentals_path = song_dir / "instrumentals.wav"
    if not vocals_path.exists() or not instrumentals_path.exists():
        raise Exception(f"Vocals or instrumentals not found in {folder_name}! Error: Audio file not found.")
    # Flatten nested folder names so the output filename contains no
    # path separators (a nested name previously produced a bogus path).
    safe_name = folder_name.replace('/', '_')
    original_mix_path = str(INFERENCE_OUTPUT_DIRNAME / f"{safe_name}_original_mix.wav")
    mix_audio(str(vocals_path), str(instrumentals_path), original_mix_path)
    return original_mix_path, cover_image_path
# Custom CSS for the artistic theme and layout helpers. The Pacifico font
# import and the flex containers are available to any component; logo-img /
# alchemy-img style the (currently disabled) logo images.
css = """
@import url('https://fonts.googleapis.com/css2?family=Pacifico&display=swap');
.center-container {
display: flex;
justify-content: center;
align-items: center;
}
.right-container {
display: flex;
justify-content: flex-end;
align-items: center;
}
/* Additional custom CSS for spacing and alignment */
.logo-img {
max-width: 100%;
height: auto;
display: block;
margin-left: auto;
margin-right: auto;
}
.alchemy-img {
float: right;
max-height: 60px; /* Adjust based on your preference */
}
"""
# Static HTML banner rendered at the top of the app via gr.Markdown.
header_markdown = """
<div style="text-align: center; font-weight: bold; font-size: 18px;">
<span style="margin-right: 20px;">TECHNATION</span>
<span style="font-size: 24px;">VoiceBlend by Alchemy AI</span>
<span style="margin-left: 20px;">McGill University</span>
</div>
"""
# Gradio app with custom CSS and theme.
with gr.Blocks(css=css) as app:
    # Header row: title banner (the logo image is currently disabled).
    with gr.Row():
        # gr.Image(logo_image_path, container=False, show_label=False, show_download_button=False, height=100, width=100)
        gr.Markdown(header_markdown)
    with gr.Row():
        # Column 1: song pick — loads the original mix preview and cover art.
        with gr.Column():
            gr.Markdown("#### Step 1: Pick Your Track 🎵")
            song_dropdown = gr.Dropdown(label="Pick one of 5 songs:", choices=get_songs())
            album_image = gr.Image(label="Cover Image", height=300)
            original_mix_audio = gr.Audio(label="Original Mix (Vocals + Instrumentals)", interactive=False)
            song_dropdown.change(fn=update_song_features, inputs=song_dropdown, outputs=[original_mix_audio, album_image])
        # Column 2: voice pick — shows the person's photo and description,
        # plus the button that triggers the conversion.
        with gr.Column():
            gr.Markdown("#### Step 2: Select Your Voice to Emulate 👩🏻")
            person_dropdown = gr.Dropdown(label="Select one of 15 voices:", choices=get_people())
            person_image = gr.Image(label="Person", height=300)
            person_description_text = gr.Textbox(label="Description:")
            person_dropdown.change(update_person_details, inputs=person_dropdown, outputs=[person_image, person_description_text])
            convert_button = gr.Button("Convert")
        # Column 3: results — converted-mix playback (type="filepath" so the
        # handler returns a path) and the social-media post generator.
        with gr.Column():
            gr.Markdown("#### Your Custom AI Vocal - Unveiled! 🤖")
            converted_mix_audio = gr.Audio(label="Converted Mix (Converted Vocals + Instrumentals)", type="filepath")
            convert_button.click(voice_conversion, inputs=[person_dropdown, song_dropdown], outputs=[converted_mix_audio])
            generate_button = gr.Button("Generate Social Media Post")
            generate_button.click(gen_caption_image, inputs=[song_dropdown, person_dropdown], outputs=[gr.Image(label="Generated Image", type="numpy", height=300), gr.Textbox(label="Generated Caption")])

if __name__ == "__main__":
    # share=True also creates a public Gradio link besides the local server.
    app.launch(share = True)