File size: 2,353 Bytes
84a09a7
39a0c0a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20e6dc8
39a0c0a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f0abfc2
39a0c0a
 
 
 
 
 
 
 
 
 
 
f0abfc2
39a0c0a
 
 
 
 
 
 
 
 
 
 
 
e2d41fa
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
from tts import TTS_object
import soundfile as sf
import gradio as gr
import subprocess
import random
import torch
import os
import re

# Preparing paths.
# os.path.abspath guards against an empty dirname: when the script is
# launched from its own directory, os.path.dirname(__file__) is "".
BASE_PATH = os.path.dirname(os.path.abspath(__file__))
MODEL_PATH_v1 = os.path.join(BASE_PATH, "xtts_v1.1")
MODEL_PATH_v2 = os.path.join(BASE_PATH, "xtts_v2")
OUTPUTS_PATH = os.path.join(BASE_PATH, "outputs")
REFERENCES_PATH = os.path.join(BASE_PATH, "references")

# sf.write() in predict() raises if the output directory is missing.
os.makedirs(OUTPUTS_PATH, exist_ok=True)

# HTML snippet rendered under the audio outputs. `{}` is filled via .format()
# with a JS string listing which model produced which (shuffled) output slot;
# the button reveals it in an alert() so the listener is not biased upfront.
# Fixed: `</br>` is not a valid HTML tag (line breaks are the void element
# `<br>`), and the backslash-escaped quotes were redundant inside a
# triple-quoted string.
HTML_BUTTON = """
<br>
<div style="text-align: center;">
    <button type="button" onclick="alert('{}')" style="padding: 10px 20px;
            font-size: 16px; background-color: #4CAF50; color: white;
            border: none; cursor: pointer; border-radius: 4px;">
        Expose model names
    </button>
</div>"""

# Models under evaluation, keyed by a descriptive name. Each entry is
# instantiated at import time on the first CUDA device (requires a GPU).
# NOTE(review): only one model is registered, so the shuffle in predict()
# currently has no visible effect — presumably more entries are added here
# when comparing checkpoints; confirm with the project owner.
models = {
    "xtts_v2_wajeez": TTS_object("models", torch.device('cuda:0'))
}

# Drives how many Audio output widgets the Gradio interface creates below.
MODELS_COUNT = len(models)

def predict(text, speaker):
    """Synthesize *text* with every registered model using *speaker*'s voice.

    Parameters
    ----------
    text : str
        Text to synthesize.
    speaker : str
        Base name of a reference wav inside ``REFERENCES_PATH`` (no ".wav").

    Returns
    -------
    tuple
        ``(text, *audio_paths, html)`` — the audio paths are shuffled so the
        listener cannot tell which model produced which clip; ``html`` is the
        "Expose model names" button that reveals the slot-to-model mapping.
    """
    reference_file_path = os.path.join(REFERENCES_PATH, speaker + ".wav")

    output_paths = []
    for model_name, model in models.items():
        wav = model.inference(text, reference_file_path)
        path = os.path.join(OUTPUTS_PATH, model_name + ".wav")
        sf.write(path, wav, 24000)  # XTTS produces 24 kHz audio
        output_paths.append(path)

    # Shuffle so output slot N does not always map to the same model.
    random.shuffle(output_paths)
    # '\\n' stays a literal backslash-n in the HTML so the browser's JS
    # engine renders it as a newline inside alert(). Using basename/splitext
    # (instead of splitting on '/') keeps the name extraction portable to
    # Windows path separators.
    actual_models = '\\n'.join(
        f"- The model number {i + 1} is {os.path.splitext(os.path.basename(path))[0]}"
        for i, path in enumerate(output_paths)
    )
    return (text, *output_paths, HTML_BUTTON.format(actual_models))


# Get speakers from references path to prepare the speakers list
speakers = ["Wajeez"]

playground = gr.Interface(
    fn = predict,
    inputs = [
        gr.Textbox(
            value = "مرحبا كيف حالك؟",
            label = "Input text",
            info = "One or two sentences at a time is better. Up to 200 text characters."
        ),
        gr.Dropdown(
            speakers,
            value="Wajeez",
            label = "Speaker / Reference source",
            info = "Choose your speaker or choose to upload / record a new speaker."
        ),
    ],
    outputs = [gr.Textbox(
        label = "Synthesized text",
        info = "The text used as input after preprocessing is done (if any)."
    )] + [gr.components.Audio(label = f'Model {i + 1}', type = 'filepath') for i in range(MODELS_COUNT)] + [gr.HTML()],
    cache_examples = False,
    allow_flagging = 'never'
)

playground.launch()