|
|
from tts import TTS_object |
|
|
import soundfile as sf |
|
|
import gradio as gr |
|
|
import subprocess |
|
|
import random |
|
|
import torch |
|
|
import os |
|
|
import re |
|
|
|
|
|
|
|
|
# Directory containing this script; every other path is resolved relative to it.
BASE_PATH = os.path.dirname(__file__)

# Locations of the bundled XTTS checkpoints.
# NOTE(review): these two constants are not referenced below — the model in
# `models` is loaded from the relative "models" directory; confirm whether
# they are still needed.
MODEL_PATH_v1 = os.path.join(BASE_PATH, "xtts_v1.1")
MODEL_PATH_v2 = os.path.join(BASE_PATH, "xtts_v2")

# Where synthesized wav files are written (one file per model).
OUTPUTS_PATH = os.path.join(BASE_PATH, "outputs")

# Per-speaker reference wav clips used as the voice-cloning source.
REFERENCES_PATH = os.path.join(BASE_PATH, "references")
|
|
|
|
|
# HTML snippet appended after the audio outputs: a button whose onclick
# alert() reveals which model produced which (shuffled) clip. The `{}`
# placeholder is filled via str.format() in predict(); the injected text uses
# literal `\n` sequences, which the JavaScript string in alert() renders as
# newlines.
HTML_BUTTON = """


</br>


<div style="text-align: center;">


<button type="button" onclick="alert(\'{}\')" style="padding: 10px 20px;


font-size: 16px; background-color: #4CAF50; color: white;


border: none; cursor: pointer; border-radius: 4px;">


Expose model names


</button>


</div>"""
|
|
|
|
|
# Models under comparison, keyed by an identifier that also becomes the
# output wav filename stem in predict(). Each value is a loaded TTS engine
# pinned to the first CUDA device.
# NOTE(review): weights load from the relative "models" directory, not from
# MODEL_PATH_v1 / MODEL_PATH_v2 — confirm which path is authoritative.
models = {
    "xtts_v2_wajeez": TTS_object("models", torch.device('cuda:0'))
}

# Number of models — drives how many Audio output components the UI creates.
MODELS_COUNT = len(models)
|
|
|
|
|
def predict(text, speaker):
    """Synthesize `text` with every loaded model using `speaker`'s reference clip.

    Parameters
    ----------
    text : str
        The input text to synthesize.
    speaker : str
        Name of the reference speaker; `<speaker>.wav` must exist in
        REFERENCES_PATH.

    Returns
    -------
    tuple
        (text, *shuffled_output_wav_paths, html_reveal_button) — one entry per
        Gradio output component.
    """
    reference_file_path = os.path.join(REFERENCES_PATH, speaker + ".wav")

    # Make sure the output directory exists before writing into it.
    os.makedirs(OUTPUTS_PATH, exist_ok=True)

    output_paths = []
    for model_name, model in models.items():
        wav = model.inference(text, reference_file_path)
        path = os.path.join(OUTPUTS_PATH, model_name + ".wav")
        # XTTS produces 24 kHz audio.
        sf.write(path, wav, 24000)
        output_paths.append(path)

    # Shuffle so the listener cannot infer which model produced which clip
    # from the display order (blind comparison).
    random.shuffle(output_paths)

    # Build the text for the JS alert(); the literal '\n' sequences become
    # newlines when the browser evaluates the string. Use basename/splitext
    # instead of splitting on '/' so this also works with Windows separators.
    actual_models = '\\n'.join(
        f"- The model number {i + 1} is {os.path.splitext(os.path.basename(path))[0]}"
        for i, path in enumerate(output_paths)
    )
    return (text, *output_paths, HTML_BUTTON.format(actual_models))
|
|
|
|
|
|
|
|
|
|
|
# Available reference speakers; each must have a matching `<name>.wav`
# in REFERENCES_PATH.
speakers = ["Wajeez"]

# Gradio UI: one text box and a speaker dropdown as inputs; outputs are the
# (possibly preprocessed) text, one audio player per model in shuffled order,
# and the HTML button that reveals the model names on demand.
playground = gr.Interface(
    fn = predict,
    inputs = [
        gr.Textbox(
            value = "مرحبا كيف حالك؟",
            label = "Input text",
            info = "One or two sentences at a time is better. Up to 200 text characters."
        ),
        gr.Dropdown(
            speakers,
            value="Wajeez",
            label = "Speaker / Reference source",
            info = "Choose your speaker or choose to upload / record a new speaker."
        ),
    ],
    # Output component order must match the tuple returned by predict().
    outputs = [gr.Textbox(
        label = "Synthesized text",
        info = "The text used as input after preprocessing is done (if any)."
    )] + [gr.components.Audio(label = f'Model {i + 1}', type = 'filepath') for i in range(MODELS_COUNT)] + [gr.HTML()],
    cache_examples = False,
    # Disable the flagging button — results are for listening comparison only.
    allow_flagging = 'never'
)

# Start the web server (blocking call).
playground.launch()