File size: 10,865 Bytes
db2145b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fe1a783
5fa6df7
c8086c6
db2145b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c83ed3b
 
29f6f2a
 
 
c83ed3b
 
 
db2145b
 
 
671a73c
c83ed3b
db2145b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5fa6df7
db2145b
 
 
5fa6df7
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
from threading import Thread
from pathlib import Path
import gradio as gr
import subprocess
import shutil
import time
import copy
import glob
import json
import os
import shlex
import numpy as np
import librosa
from scipy.io import wavfile
import openai
from openai import OpenAI
from dotenv import load_dotenv
from PIL import Image
import requests
from io import BytesIO

load_dotenv()

# NOTE: with the v1 OpenAI SDK, assigning `OpenAI.api_key` as a class attribute
# has no effect — the key must be passed to the client constructor (or be set
# in the OPENAI_API_KEY environment variable).
client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))

# Paths according to your directory structure
CURRENT_DIR = Path(__file__).resolve().parent
PEOPLE_DIR = CURRENT_DIR / "people"              # one subfolder per voice model
SONGS_DIR = CURRENT_DIR / "songs"                # one subfolder per song (vocals + instrumentals)
INFERENCE_OUTPUT_DIRNAME = CURRENT_DIR / "inference_output"
COVER_IMAGE_PATH = SONGS_DIR / "cover.png"

logo_image_path = os.path.abspath("Logo.png")

# Ensure the inference output directory exists
INFERENCE_OUTPUT_DIRNAME.mkdir(parents=True, exist_ok=True)

def get_people():
    """Return the sorted names of the voices available for conversion.

    Each immediate subdirectory of PEOPLE_DIR is one voice model; the
    directory's stem is used as the display name.
    """
    names = [entry.stem for entry in PEOPLE_DIR.iterdir() if entry.is_dir()]
    names.sort()
    return names

def get_songs():
    """Return a sorted list of unique song folder paths relative to SONGS_DIR.

    Walks SONGS_DIR recursively so nested song folders are included; a set
    guards against duplicate names. Paths use the platform separator
    (os.sep), matching what ``os.path.relpath`` produces.
    """
    folder_names = set()
    for root, subdirs, _ in os.walk(SONGS_DIR):
        # `subdirs` renamed from `dir` to avoid shadowing the builtin.
        for subdir in subdirs:
            relative_path = os.path.relpath(os.path.join(root, subdir), SONGS_DIR)
            folder_names.add(relative_path)
    return sorted(folder_names)


def run_inference(speaker, path_str, f0_method, transpose, noise_scale, cluster_ratio):
    """Run `svc infer` on one audio file with the given speaker model.

    Args:
        speaker: dict with "model_path", "cfg_path" and "cluster_path" keys
            (as produced by get_speaker_details).
        path_str: path to the input WAV file.
        f0_method: pitch-extraction method for svc; falsy values fall back to
            "crepe" (the previous hard-coded behavior).
        transpose: semitone transposition passed as ``-t``.
        noise_scale: noise scale passed as ``-n``.
        cluster_ratio: cluster-model ratio; cluster args are only added when a
            cluster model is configured and the ratio is positive.

    Returns:
        (output_path, None) on success, or (None, error_message) on failure.
    """
    path = Path(path_str)

    model_path = speaker["model_path"]
    config_path = speaker["cfg_path"]
    cluster_path = speaker["cluster_path"]
    output_path = Path(INFERENCE_OUTPUT_DIRNAME, path.name)

    # Ensure output directory exists
    INFERENCE_OUTPUT_DIRNAME.mkdir(parents=True, exist_ok=True)

    # Build the command as an argv list: robust against spaces/quotes in
    # paths, and honors the f0_method / noise_scale parameters (previously
    # "crepe" and 0.4 were hard-coded and the parameters were ignored).
    command = [
        "svc", "infer", str(path.absolute()),
        "-m", str(model_path),
        "-c", str(config_path),
        "-t", str(transpose),
        "--f0-method", str(f0_method) if f0_method else "crepe",
        "-n", str(noise_scale),
        "-o", str(output_path),
        "--no-auto-predict-f0",
    ]
    if cluster_path and cluster_ratio > 0:
        command += ["-k", str(cluster_path), "-r", str(cluster_ratio)]

    result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

    if result.returncode != 0:
        print(f"Command failed with return code {result.returncode}")
        print("STDERR:", result.stderr)
        return None, "⚠️ Error during inference."

    if "AttributeError" in result.stderr:
        return None, "⚠️ Modelo SVC incompatible."

    if not output_path.exists():
        print("Expected output file not found:", output_path)
        return None, "⚠️ Error: Output file not found."

    return str(output_path), None

def get_speaker_details(speaker_name):
    """Find the model files for *speaker_name* (case-insensitive) under PEOPLE_DIR.

    Returns a dict with keys "model_path", "cfg_path", "description_path",
    "cluster_path" (always empty here) and "image_path", or None when no
    matching folder contains both a G_*.pth model and a *.json config.
    """
    for _, subdirs, _ in os.walk(PEOPLE_DIR):
        for candidate in subdirs:
            if candidate.lower() != speaker_name.lower():
                continue
            folder = PEOPLE_DIR / candidate
            model = next(folder.glob('G_*.pth'), None)
            config = next(folder.glob('*.json'), None)
            # Only a folder holding both a generator checkpoint and a config
            # counts as a usable speaker.
            if model and config:
                return {
                    "model_path": model,
                    "cfg_path": config,
                    "description_path": folder / "description.txt",
                    "cluster_path": "",
                    "image_path": next(folder.glob('image.png'), None),
                }
    return None

def mix_audio(vocals_path, instrumentals_path, output_path):
    """Mix vocals and instrumentals 50/50 and write a 16-bit PCM WAV.

    Args:
        vocals_path: path to the vocals WAV (its native sample rate is used
            for the output).
        instrumentals_path: path to the instrumentals WAV.
        output_path: destination path for the mixed WAV.
    """
    y_vocals, sr = librosa.load(vocals_path, sr=None)
    # Resample the instrumentals to the vocals' sample rate; previously both
    # were loaded at their native rates and mixed sample-for-sample, which
    # detunes/desyncs the result whenever the rates differ.
    y_instrumentals, _ = librosa.load(instrumentals_path, sr=sr)

    # Zero-pad the shorter track so both have the same length
    max_length = max(len(y_vocals), len(y_instrumentals))
    y_vocals_padded = np.pad(y_vocals, (0, max_length - len(y_vocals)), mode='constant')
    y_instrumentals_padded = np.pad(y_instrumentals, (0, max_length - len(y_instrumentals)), mode='constant')

    # Equal-weight mix; 0.5 + 0.5 keeps the sum within [-1, 1] for
    # normalized inputs, so the int16 scaling below cannot clip.
    mixed_audio = 0.5 * y_vocals_padded + 0.5 * y_instrumentals_padded

    # Save the mixed audio as 16-bit PCM
    wavfile.write(output_path, sr, (mixed_audio * 32767).astype(np.int16))

def gen_caption_image(song_name, person):
    """Generate an Instagram caption (GPT-3.5) and a matching image (DALL-E 3).

    Returns:
        (image, caption): the generated image as a numpy array plus the
        caption text.
    """
    # Step 1: ask the chat model for a caption.
    prompt = f"Create a creative and engaging Instagram caption for a post where {person} is performing the song '{song_name}'. Make it exciting and suitable for social media."
    chat_response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}],
    )
    caption = chat_response.choices[0].message.content

    # Step 2: feed the caption back into an image-generation prompt.
    image_prompt = f'draw an image for my instagram post having for the song {song_name} and the caption {caption}'
    image_response = client.images.generate(
        model="dall-e-3",
        prompt=image_prompt,
        size="1024x1024",
        quality="standard",
        n=1,
    )

    # Step 3: download the generated image and decode it in memory.
    download = requests.get(image_response.data[0].url)
    img = Image.open(BytesIO(download.content))

    return np.array(img), caption

def voice_conversion(person, song_selection):
    """Convert a song's vocals to the selected voice and mix with instrumentals.

    Args:
        person: speaker name as shown in the voices dropdown.
        song_selection: song folder path as produced by get_songs().

    Returns:
        Path of the converted mix WAV, or None on failure (the UI wires this
        to a single Audio component, so errors must not return a tuple).
    """
    try:
        speaker_details = get_speaker_details(person)
        if not speaker_details:
            raise Exception("Speaker not found! Error: Speaker details not found.")

        # First path component of the selection is the song folder. Normalize
        # both separator styles so this works on POSIX as well as Windows
        # (os.path.relpath in get_songs uses the platform separator).
        folder_name = song_selection.replace('\\', '/').split('/')[0]

        vocals_path = SONGS_DIR / folder_name / "vocals.wav"
        instrumentals_path = SONGS_DIR / folder_name / "instrumentals.wav"

        if not vocals_path.exists() or not instrumentals_path.exists():
            raise Exception(f"Vocals or instrumentals not found in {folder_name}! Error: Audio file not found.")

        converted_vocals_path, error = run_inference(speaker_details, str(vocals_path), 0, 0, 0.4, 0)
        if error:
            raise Exception(error)

        converted_mix_path = str(INFERENCE_OUTPUT_DIRNAME / f"{folder_name}_converted_mix.wav")
        mix_audio(converted_vocals_path, str(instrumentals_path), converted_mix_path)

        return converted_mix_path
    except Exception as e:
        # Previously this returned a 4-tuple, which the single-output Gradio
        # handler could not unpack; log and return None instead.
        print(f"voice_conversion failed: {e}")
        return None

def update_person_details(selected_person):
    """Return (image_path, description) for the selected voice, for the UI.

    Previously this raised TypeError when the speaker lookup failed and
    produced the literal string "None" for a missing image; both cases are
    now handled gracefully.
    """
    speaker_details = get_speaker_details(selected_person)
    if not speaker_details:
        # Unknown speaker: clear the image and show a hint instead of crashing.
        return None, f"No details found for '{selected_person}'."

    image_path = str(speaker_details["image_path"]) if speaker_details["image_path"] else None
    try:
        with open(speaker_details["description_path"], 'r') as file:
            person_description = file.read()
    except OSError:
        # Missing description.txt is not fatal; show an empty description.
        person_description = ""
    return image_path, person_description



def update_song_features(song_selection):
    """Build the original vocals+instrumentals mix for the chosen song.

    Returns:
        (original_mix_path, cover_image_path) for the UI's audio player and
        cover image.

    Raises:
        Exception: when the song folder lacks vocals.wav or instrumentals.wav.
    """
    cover_image_path = COVER_IMAGE_PATH
    # First path component is the song folder; normalize both separator
    # styles so this works on POSIX as well as Windows.
    folder_name = song_selection.replace('\\', '/').split('/')[0]
    vocals_path = SONGS_DIR / folder_name / "vocals.wav"
    instrumentals_path = SONGS_DIR / folder_name / "instrumentals.wav"
    if not vocals_path.exists() or not instrumentals_path.exists():
        raise Exception(f"Vocals or instrumentals not found in {folder_name}! Error: Audio file not found.")

    original_mix_path = str(INFERENCE_OUTPUT_DIRNAME / f"{folder_name}_original_mix.wav")
    mix_audio(str(vocals_path), str(instrumentals_path), original_mix_path)

    return original_mix_path, cover_image_path

# Updating the interface setup to include the event listener

# CSS for artistic theme
# CSS for artistic theme and custom positioning
css = """
@import url('https://fonts.googleapis.com/css2?family=Pacifico&display=swap');
.center-container {
    display: flex;
    justify-content: center;
    align-items: center;
}
.right-container {
    display: flex;
    justify-content: flex-end;
    align-items: center;
}
/* Additional custom CSS for spacing and alignment */
.logo-img {
    max-width: 100%;
    height: auto;
    display: block;
    margin-left: auto;
    margin-right: auto;
}
.alchemy-img {
    float: right;
    max-height: 60px; /* Adjust based on your preference */
}
"""

header_markdown = """
<div style="text-align: center; font-weight: bold; font-size: 18px;">
    <span style="margin-right: 20px;">TECHNATION</span>
    <span style="font-size: 24px;">VoiceBlend by Alchemy AI</span>
    <span style="margin-left: 20px;">McGill University</span>
</div>
"""

# Gradio app with custom CSS and theme
# UI layout. Component creation order determines on-screen order, so the
# structure below is left exactly as-is; only comments are added.
with gr.Blocks(css=css) as app:
    # Header row: title banner (logo image currently disabled).
    with gr.Row():
            # gr.Image(logo_image_path, container=False, show_label=False, show_download_button=False, height=100, width=100)
            gr.Markdown(header_markdown)



    # Main three-column workflow: pick a song -> pick a voice -> get results.
    with gr.Row():
        with gr.Column():
            gr.Markdown("#### Step 1: Pick Your Track 🎵")
            song_dropdown = gr.Dropdown(label="Pick one of 5 songs:", choices=get_songs())
            album_image = gr.Image(label="Cover Image", height=300)
            original_mix_audio = gr.Audio(label="Original Mix (Vocals + Instrumentals)", interactive=False)
            # Selecting a song builds/plays the original mix and shows the cover.
            song_dropdown.change(fn=update_song_features, inputs=song_dropdown, outputs=[original_mix_audio, album_image])

        with gr.Column():
            gr.Markdown("#### Step 2: Select Your Voice to Emulate 👩🏻")
            person_dropdown = gr.Dropdown(label="Select one of 15 voices:", choices=get_people())
            person_image = gr.Image(label="Person", height=300)
            person_description_text = gr.Textbox(label="Description:")
            # Selecting a voice shows its picture and description.
            person_dropdown.change(update_person_details, inputs=person_dropdown, outputs=[person_image, person_description_text])
            
            convert_button = gr.Button("Convert")

        with gr.Column():
            gr.Markdown("#### Your Custom AI Vocal - Unveiled! 🤖")
            converted_mix_audio = gr.Audio(label="Converted Mix (Converted Vocals + Instrumentals)", type="filepath")
            # Convert runs SVC inference and remixes with the instrumentals.
            convert_button.click(voice_conversion, inputs=[person_dropdown, song_dropdown], outputs=[converted_mix_audio])

            generate_button = gr.Button("Generate Social Media Post")
            # Generates an AI caption + image for a social media post.
            generate_button.click(gen_caption_image, inputs=[song_dropdown, person_dropdown], outputs=[gr.Image(label="Generated Image", type="numpy", height=300), gr.Textbox(label="Generated Caption")])

if __name__ == "__main__":
    app.launch(share = True)