File size: 10,865 Bytes
db2145b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fe1a783
5fa6df7
c8086c6
db2145b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c83ed3b
 
29f6f2a
 
 
c83ed3b
 
 
db2145b
 
 
671a73c
c83ed3b
db2145b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5fa6df7
db2145b
 
 
5fa6df7
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
from threading import Thread
from pathlib import Path
import gradio as gr
import subprocess
import shutil
import time
import copy
import glob
import json
import os
import shlex
import numpy as np
import librosa
from scipy.io import wavfile
import openai
from openai import OpenAI
from dotenv import load_dotenv
from PIL import Image
import requests
from io import BytesIO

load_dotenv()

# NOTE: with the v1 OpenAI SDK, assigning `OpenAI.api_key` as a class attribute
# has no effect — the key must be passed to the client constructor (or be set
# in the OPENAI_API_KEY environment variable).
client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))

# Paths according to your directory structure
CURRENT_DIR = Path(__file__).resolve().parent
PEOPLE_DIR = CURRENT_DIR / "people"              # one subfolder per voice model
SONGS_DIR = CURRENT_DIR / "songs"                # one subfolder per song (vocals + instrumentals)
INFERENCE_OUTPUT_DIRNAME = CURRENT_DIR / "inference_output"
COVER_IMAGE_PATH = SONGS_DIR / "cover.png"

logo_image_path = os.path.abspath("Logo.png")

# Ensure the inference output directory exists
INFERENCE_OUTPUT_DIRNAME.mkdir(parents=True, exist_ok=True)

def get_people():
    """Return the sorted names of the voices available for conversion.

    Each immediate subdirectory of PEOPLE_DIR is one voice model; the
    directory's stem is used as the display name.
    """
    names = [entry.stem for entry in PEOPLE_DIR.iterdir() if entry.is_dir()]
    names.sort()
    return names

def get_songs():
    """Return a sorted list of unique song folder paths relative to SONGS_DIR.

    Walks SONGS_DIR recursively so nested song folders are included; a set
    guards against duplicate names. Paths use the platform separator
    (os.sep), matching what ``os.path.relpath`` produces.
    """
    folder_names = set()
    for root, subdirs, _ in os.walk(SONGS_DIR):
        # `subdirs` renamed from `dir` to avoid shadowing the builtin.
        for subdir in subdirs:
            relative_path = os.path.relpath(os.path.join(root, subdir), SONGS_DIR)
            folder_names.add(relative_path)
    return sorted(folder_names)


def run_inference(speaker, path_str, f0_method, transpose, noise_scale, cluster_ratio):
    """Run `svc infer` on one audio file with the given speaker model.

    Args:
        speaker: dict with "model_path", "cfg_path" and "cluster_path" keys
            (as produced by get_speaker_details).
        path_str: path to the input WAV file.
        f0_method: pitch-extraction method for svc; falsy values fall back to
            "crepe" (the previous hard-coded behavior).
        transpose: semitone transposition passed as ``-t``.
        noise_scale: noise scale passed as ``-n``.
        cluster_ratio: cluster-model ratio; cluster args are only added when a
            cluster model is configured and the ratio is positive.

    Returns:
        (output_path, None) on success, or (None, error_message) on failure.
    """
    path = Path(path_str)

    model_path = speaker["model_path"]
    config_path = speaker["cfg_path"]
    cluster_path = speaker["cluster_path"]
    output_path = Path(INFERENCE_OUTPUT_DIRNAME, path.name)

    # Ensure output directory exists
    INFERENCE_OUTPUT_DIRNAME.mkdir(parents=True, exist_ok=True)

    # Build the command as an argv list: robust against spaces/quotes in
    # paths, and honors the f0_method / noise_scale parameters (previously
    # "crepe" and 0.4 were hard-coded and the parameters were ignored).
    command = [
        "svc", "infer", str(path.absolute()),
        "-m", str(model_path),
        "-c", str(config_path),
        "-t", str(transpose),
        "--f0-method", str(f0_method) if f0_method else "crepe",
        "-n", str(noise_scale),
        "-o", str(output_path),
        "--no-auto-predict-f0",
    ]
    if cluster_path and cluster_ratio > 0:
        command += ["-k", str(cluster_path), "-r", str(cluster_ratio)]

    result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

    if result.returncode != 0:
        print(f"Command failed with return code {result.returncode}")
        print("STDERR:", result.stderr)
        return None, "⚠️ Error during inference."

    if "AttributeError" in result.stderr:
        return None, "⚠️ Modelo SVC incompatible."

    if not output_path.exists():
        print("Expected output file not found:", output_path)
        return None, "⚠️ Error: Output file not found."

    return str(output_path), None

def get_speaker_details(speaker_name):
    """Find the model files for *speaker_name* (case-insensitive) under PEOPLE_DIR.

    Returns a dict with keys "model_path", "cfg_path", "description_path",
    "cluster_path" (always empty here) and "image_path", or None when no
    matching folder contains both a G_*.pth model and a *.json config.
    """
    for _, subdirs, _ in os.walk(PEOPLE_DIR):
        for candidate in subdirs:
            if candidate.lower() != speaker_name.lower():
                continue
            folder = PEOPLE_DIR / candidate
            model = next(folder.glob('G_*.pth'), None)
            config = next(folder.glob('*.json'), None)
            # Only a folder holding both a generator checkpoint and a config
            # counts as a usable speaker.
            if model and config:
                return {
                    "model_path": model,
                    "cfg_path": config,
                    "description_path": folder / "description.txt",
                    "cluster_path": "",
                    "image_path": next(folder.glob('image.png'), None),
                }
    return None

def mix_audio(vocals_path, instrumentals_path, output_path):
    """Mix vocals and instrumentals 50/50 and write a 16-bit PCM WAV.

    Args:
        vocals_path: path to the vocals WAV (its native sample rate is used
            for the output).
        instrumentals_path: path to the instrumentals WAV.
        output_path: destination path for the mixed WAV.
    """
    y_vocals, sr = librosa.load(vocals_path, sr=None)
    # Resample the instrumentals to the vocals' sample rate; previously both
    # were loaded at their native rates and mixed sample-for-sample, which
    # detunes/desyncs the result whenever the rates differ.
    y_instrumentals, _ = librosa.load(instrumentals_path, sr=sr)

    # Zero-pad the shorter track so both have the same length
    max_length = max(len(y_vocals), len(y_instrumentals))
    y_vocals_padded = np.pad(y_vocals, (0, max_length - len(y_vocals)), mode='constant')
    y_instrumentals_padded = np.pad(y_instrumentals, (0, max_length - len(y_instrumentals)), mode='constant')

    # Equal-weight mix; 0.5 + 0.5 keeps the sum within [-1, 1] for
    # normalized inputs, so the int16 scaling below cannot clip.
    mixed_audio = 0.5 * y_vocals_padded + 0.5 * y_instrumentals_padded

    # Save the mixed audio as 16-bit PCM
    wavfile.write(output_path, sr, (mixed_audio * 32767).astype(np.int16))

def gen_caption_image(song_name, person):
    """Generate an Instagram caption (GPT-3.5) and a matching image (DALL-E 3).

    Returns:
        (image, caption): the generated image as a numpy array plus the
        caption text.
    """
    # Step 1: ask the chat model for a caption.
    prompt = f"Create a creative and engaging Instagram caption for a post where {person} is performing the song '{song_name}'. Make it exciting and suitable for social media."
    chat_response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}],
    )
    caption = chat_response.choices[0].message.content

    # Step 2: feed the caption back into an image-generation prompt.
    image_prompt = f'draw an image for my instagram post having for the song {song_name} and the caption {caption}'
    image_response = client.images.generate(
        model="dall-e-3",
        prompt=image_prompt,
        size="1024x1024",
        quality="standard",
        n=1,
    )

    # Step 3: download the generated image and decode it in memory.
    download = requests.get(image_response.data[0].url)
    img = Image.open(BytesIO(download.content))

    return np.array(img), caption

def voice_conversion(person, song_selection):
    """Convert a song's vocals to the selected voice and mix with instrumentals.

    Args:
        person: speaker name as shown in the voices dropdown.
        song_selection: song folder path as produced by get_songs().

    Returns:
        Path of the converted mix WAV, or None on failure (the UI wires this
        to a single Audio component, so errors must not return a tuple).
    """
    try:
        speaker_details = get_speaker_details(person)
        if not speaker_details:
            raise Exception("Speaker not found! Error: Speaker details not found.")

        # First path component of the selection is the song folder. Normalize
        # both separator styles so this works on POSIX as well as Windows
        # (os.path.relpath in get_songs uses the platform separator).
        folder_name = song_selection.replace('\\', '/').split('/')[0]

        vocals_path = SONGS_DIR / folder_name / "vocals.wav"
        instrumentals_path = SONGS_DIR / folder_name / "instrumentals.wav"

        if not vocals_path.exists() or not instrumentals_path.exists():
            raise Exception(f"Vocals or instrumentals not found in {folder_name}! Error: Audio file not found.")

        converted_vocals_path, error = run_inference(speaker_details, str(vocals_path), 0, 0, 0.4, 0)
        if error:
            raise Exception(error)

        converted_mix_path = str(INFERENCE_OUTPUT_DIRNAME / f"{folder_name}_converted_mix.wav")
        mix_audio(converted_vocals_path, str(instrumentals_path), converted_mix_path)

        return converted_mix_path
    except Exception as e:
        # Previously this returned a 4-tuple, which the single-output Gradio
        # handler could not unpack; log and return None instead.
        print(f"voice_conversion failed: {e}")
        return None

def update_person_details(selected_person):
    """Return (image_path, description) for the selected voice, for the UI.

    Previously this raised TypeError when the speaker lookup failed and
    produced the literal string "None" for a missing image; both cases are
    now handled gracefully.
    """
    speaker_details = get_speaker_details(selected_person)
    if not speaker_details:
        # Unknown speaker: clear the image and show a hint instead of crashing.
        return None, f"No details found for '{selected_person}'."

    image_path = str(speaker_details["image_path"]) if speaker_details["image_path"] else None
    try:
        with open(speaker_details["description_path"], 'r') as file:
            person_description = file.read()
    except OSError:
        # Missing description.txt is not fatal; show an empty description.
        person_description = ""
    return image_path, person_description



def update_song_features(song_selection):
    """Build the original vocals+instrumentals mix for the chosen song.

    Returns:
        (original_mix_path, cover_image_path) for the UI's audio player and
        cover image.

    Raises:
        Exception: when the song folder lacks vocals.wav or instrumentals.wav.
    """
    cover_image_path = COVER_IMAGE_PATH
    # First path component is the song folder; normalize both separator
    # styles so this works on POSIX as well as Windows.
    folder_name = song_selection.replace('\\', '/').split('/')[0]
    vocals_path = SONGS_DIR / folder_name / "vocals.wav"
    instrumentals_path = SONGS_DIR / folder_name / "instrumentals.wav"
    if not vocals_path.exists() or not instrumentals_path.exists():
        raise Exception(f"Vocals or instrumentals not found in {folder_name}! Error: Audio file not found.")

    original_mix_path = str(INFERENCE_OUTPUT_DIRNAME / f"{folder_name}_original_mix.wav")
    mix_audio(str(vocals_path), str(instrumentals_path), original_mix_path)

    return original_mix_path, cover_image_path

# Updating the interface setup to include the event listener

# CSS for artistic theme
# CSS for artistic theme and custom positioning
css = """
@import url('https://fonts.googleapis.com/css2?family=Pacifico&display=swap');
.center-container {
    display: flex;
    justify-content: center;
    align-items: center;
}
.right-container {
    display: flex;
    justify-content: flex-end;
    align-items: center;
}
/* Additional custom CSS for spacing and alignment */
.logo-img {
    max-width: 100%;
    height: auto;
    display: block;
    margin-left: auto;
    margin-right: auto;
}
.alchemy-img {
    float: right;
    max-height: 60px; /* Adjust based on your preference */
}
"""

header_markdown = """
<div style="text-align: center; font-weight: bold; font-size: 18px;">
    <span style="margin-right: 20px;">TECHNATION</span>
    <span style="font-size: 24px;">VoiceBlend by Alchemy AI</span>
    <span style="margin-left: 20px;">McGill University</span>
</div>
"""

# Gradio app with custom CSS and theme
# UI layout. Component creation order determines on-screen order, so the
# structure below is left exactly as-is; only comments are added.
with gr.Blocks(css=css) as app:
    # Header row: title banner (logo image currently disabled).
    with gr.Row():
            # gr.Image(logo_image_path, container=False, show_label=False, show_download_button=False, height=100, width=100)
            gr.Markdown(header_markdown)



    # Main three-column workflow: pick a song -> pick a voice -> get results.
    with gr.Row():
        with gr.Column():
            gr.Markdown("#### Step 1: Pick Your Track 🎵")
            song_dropdown = gr.Dropdown(label="Pick one of 5 songs:", choices=get_songs())
            album_image = gr.Image(label="Cover Image", height=300)
            original_mix_audio = gr.Audio(label="Original Mix (Vocals + Instrumentals)", interactive=False)
            # Selecting a song builds/plays the original mix and shows the cover.
            song_dropdown.change(fn=update_song_features, inputs=song_dropdown, outputs=[original_mix_audio, album_image])

        with gr.Column():
            gr.Markdown("#### Step 2: Select Your Voice to Emulate 👩🏻")
            person_dropdown = gr.Dropdown(label="Select one of 15 voices:", choices=get_people())
            person_image = gr.Image(label="Person", height=300)
            person_description_text = gr.Textbox(label="Description:")
            # Selecting a voice shows its picture and description.
            person_dropdown.change(update_person_details, inputs=person_dropdown, outputs=[person_image, person_description_text])
            
            convert_button = gr.Button("Convert")

        with gr.Column():
            gr.Markdown("#### Your Custom AI Vocal - Unveiled! 🤖")
            converted_mix_audio = gr.Audio(label="Converted Mix (Converted Vocals + Instrumentals)", type="filepath")
            # Convert runs SVC inference and remixes with the instrumentals.
            convert_button.click(voice_conversion, inputs=[person_dropdown, song_dropdown], outputs=[converted_mix_audio])

            generate_button = gr.Button("Generate Social Media Post")
            # Generates an AI caption + image for a social media post.
            generate_button.click(gen_caption_image, inputs=[song_dropdown, person_dropdown], outputs=[gr.Image(label="Generated Image", type="numpy", height=300), gr.Textbox(label="Generated Caption")])

if __name__ == "__main__":
    app.launch(share = True)