File size: 13,473 Bytes
ca90f09
f81d4f2
0db6209
f05e79d
0db6209
156829e
f81d4f2
 
 
156829e
64f96e7
 
 
2f7d9da
 
d02ad9c
156829e
48423a0
 
 
 
63c45d7
f89c55d
d02ad9c
f81d4f2
 
 
194fffd
f81d4f2
 
 
 
def995e
f81d4f2
def995e
f81d4f2
def995e
 
 
0db6209
 
 
156829e
 
0db6209
96324d6
285150b
156829e
96324d6
285150b
 
 
194fffd
f81d4f2
 
4584388
f81d4f2
 
 
 
 
 
156829e
f81d4f2
 
0db6209
156829e
 
 
def995e
156829e
48423a0
156829e
 
 
 
 
 
 
 
 
 
ee48acc
f89c55d
156829e
 
 
d3731c3
156829e
d3731c3
156829e
f89c55d
4584388
63c2202
 
156829e
f38c6b2
156829e
d3731c3
156829e
d3731c3
f89c55d
f74bce2
 
156829e
f74bce2
156829e
 
 
d3731c3
f74bce2
156829e
3b69bc5
156829e
 
3b69bc5
156829e
3b69bc5
156829e
3b69bc5
 
156829e
3b69bc5
156829e
 
 
3b69bc5
d3731c3
 
156829e
 
 
 
 
 
 
3b69bc5
 
 
 
156829e
f74bce2
156829e
f74bce2
d3731c3
 
 
 
156829e
d3731c3
156829e
d3731c3
 
def995e
 
 
 
d3731c3
5a8706b
 
 
 
 
 
156829e
 
 
 
48423a0
d3731c3
 
 
 
 
 
48423a0
6550ebd
d3731c3
 
156829e
f81d4f2
d3731c3
ddd9ee2
f81d4f2
 
 
 
 
 
c3816ad
 
 
f74bce2
f81d4f2
 
 
 
 
 
 
63c45d7
156829e
def995e
d3731c3
d6aee2b
 
def995e
156829e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48423a0
 
 
f7c2b84
156829e
 
 
48423a0
 
 
 
 
 
 
 
156829e
d3731c3
48423a0
 
 
 
 
 
 
 
5a8706b
 
 
 
 
 
d6aee2b
4e13633
 
d3731c3
4e13633
 
 
d3731c3
939c1fe
d3731c3
939c1fe
f81d4f2
3b69bc5
d636635
ee48acc
 
d3731c3
194fffd
d3731c3
194fffd
e1c65f1
533ef97
d3731c3
533ef97
d3731c3
e1c65f1
63c45d7
 
 
 
8d707c1
d3731c3
 
 
8d707c1
 
63c45d7
 
 
 
 
d3731c3
 
 
 
 
 
63c45d7
 
 
 
 
d3731c3
5c9b2a5
d3731c3
 
63c45d7
 
 
d3731c3
63c45d7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d3731c3
63c45d7
 
d3731c3
63c45d7
 
 
 
 
 
 
d3731c3
 
 
 
 
63c45d7
b040f35
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
import sys
import io, os, stat
import subprocess
import random
from zipfile import ZipFile
import uuid
import time
import torch
import torchaudio

# Download the unidic dictionary needed by mecab (Japanese tokenization).
#download for mecab
os.system('python -m unidic download')

# By using XTTS you agree to CPML license https://coqui.ai/cpml
os.environ["COQUI_TOS_AGREED"] = "1"

import langid
import base64
import csv
from io import StringIO
import datetime
import re

import gradio as gr
from scipy.io.wavfile import write
from pydub import AudioSegment

from TTS.api import TTS
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
from TTS.utils.generic_utils import get_user_data_dir

# Hugging Face token; used to restart this Space after unrecoverable CUDA errors.
HF_TOKEN = os.environ.get("HF_TOKEN")

from huggingface_hub import HfApi

api = HfApi(token=HF_TOKEN)
repo_id = "coqui/xtts"

# Ship our own ffmpeg binary (the bundled one lacks the filters used for
# reference-audio cleanup) and make it executable.
print("Export newer ffmpeg binary for denoise filter")
ZipFile("ffmpeg.zip").extractall()
print("Make ffmpeg binary executable")
st = os.stat("ffmpeg")
os.chmod("ffmpeg", st.st_mode | stat.S_IEXEC)

# Fetch the XTTS v2 checkpoint into the local TTS cache (no-op if present).
print("Downloading if not downloaded Coqui XTTS V2")
from TTS.utils.manage import ModelManager

model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
ModelManager().download_model(model_name)
model_path = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))
print("XTTS downloaded")

config = XttsConfig()
config.load_json(os.path.join(model_path, "config.json"))

# Load the model once at startup; inference reuses this global instance.
model = Xtts.init_from_config(config)
model.load_checkpoint(
    config,
    checkpoint_path=os.path.join(model_path, "model.pth"),
    vocab_path=os.path.join(model_path, "vocab.json"),
    eval=True,
    use_deepspeed=True,
)
model.cuda()

# Globals recording a CUDA device-side assert: once set, the process is
# poisoned and the Space gets restarted on the next request.
DEVICE_ASSERT_DETECTED = 0
DEVICE_ASSERT_PROMPT = None
DEVICE_ASSERT_LANG = None

# Language codes supported by the loaded checkpoint (drives validation).
supported_languages = config.languages

def predict(
    prompt,
    language,
    audio_file_pth,
    mic_file_path,
    use_mic,
    voice_cleanup,
    no_lang_auto_detect,
    agree,
):
    """Synthesize speech for *prompt* in *language*, cloning the voice from the
    reference audio (uploaded file, or microphone recording when *use_mic*).

    Returns a 4-tuple ``(waveform_video, "output.wav", metrics_text,
    speaker_wav_used)``; every element is ``None`` when validation fails or an
    unrecoverable error occurs.
    """
    # Guard clause: the CPML terms must be accepted before anything runs.
    if not agree:
        gr.Warning("Please accept the Terms & Condition!")
        return (None, None, None, None)

    if language not in supported_languages:
        gr.Warning(
            f"Language you put {language} in is not in our Supported Languages, please choose from dropdown"
        )
        return (None, None, None, None)

    # Detect the prompt's language so we can warn on an obvious mismatch.
    language_predicted = langid.classify(prompt)[0].strip()
    if language_predicted == "zh":
        # langid reports bare "zh"; XTTS expects the "zh-cn" code.
        language_predicted = "zh-cn"

    print(f"Detected language:{language_predicted}, Chosen language:{language}")

    # Only trust auto-detection on texts long enough to classify reliably.
    if len(prompt) > 15 and language_predicted != language and not no_lang_auto_detect:
        gr.Warning(
            "It looks like your text isn't the language you chose, if you're sure the text is the same language you chose, please check disable language auto-detection checkbox"
        )
        return (None, None, None, None)

    # Pick the reference audio: microphone recording or uploaded file.
    if use_mic:
        if mic_file_path is None:
            gr.Warning(
                "Please record your voice with Microphone, or uncheck Use Microphone to use reference audios"
            )
            return (None, None, None, None)
        speaker_wav = mic_file_path
    else:
        speaker_wav = audio_file_pth

    # ffmpeg filter fragments for the optional reference-audio cleanup pass:
    # band-limit to speech frequencies, then strip leading/trailing silence.
    lowpass_highpass = "lowpass=8000,highpass=75,"
    trim_silence = (
        "areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02,"
        "areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02,"
    )

    if voice_cleanup:
        try:
            out_filename = speaker_wav + str(uuid.uuid4()) + ".wav"
            # NOTE(review): split(" ") breaks on paths containing spaces —
            # assumes gradio temp paths have none; confirm before reuse.
            shell_command = f"./ffmpeg -y -i {speaker_wav} -af {lowpass_highpass}{trim_silence} {out_filename}".split(" ")
            subprocess.run(
                shell_command,
                capture_output=False,
                text=True,
                check=True,
            )
            speaker_wav = out_filename
            print("Filtered microphone input")
        except subprocess.CalledProcessError:
            # Best-effort cleanup: fall back to the unfiltered reference.
            print("Error: failed filtering, use original microphone input")

    if len(prompt) < 2:
        gr.Warning("Please give a longer prompt text")
        return (None, None, None, None)

    # Changed from 200 to 5000 characters
    if len(prompt) > 5000:
        gr.Warning(
            "Text length limited to 5000 characters for this demo"
        )
        return (None, None, None, None)

    global DEVICE_ASSERT_DETECTED
    if DEVICE_ASSERT_DETECTED:
        global DEVICE_ASSERT_PROMPT
        global DEVICE_ASSERT_LANG
        print(f"Unrecoverable exception caused by language:{DEVICE_ASSERT_LANG} prompt:{DEVICE_ASSERT_PROMPT}")
        # A previous CUDA device-side assert poisoned the process: restart
        # the Space (unless it is already rebuilding).
        space = api.get_space_runtime(repo_id=repo_id)
        if space.stage != "BUILDING":
            api.restart_space(repo_id=repo_id)
        else:
            print("TRIED TO RESTART but space is building")

    try:
        metrics_text = ""
        t_latent = time.time()

        try:
            (gpt_cond_latent, speaker_embedding) = model.get_conditioning_latents(
                audio_path=speaker_wav,
                gpt_cond_len=30,
                gpt_cond_chunk_len=4,
                max_ref_length=60,
            )
        except Exception as e:
            print("Speaker encoding error", str(e))
            gr.Warning("It appears something wrong with reference, did you unmute your microphone?")
            return (None, None, None, None)

        latent_calculation_time = time.time() - t_latent
        # Double sentence-ending punctuation after word/non-ASCII characters
        # so XTTS pauses more naturally (raw string avoids invalid-escape
        # warnings in the pattern).
        prompt = re.sub(r"([^\x00-\x7F]|\w)(\.|\。|\?)", r"\1 \2\2", prompt)

        print("I: Generating new audio...")
        t0 = time.time()
        out = model.inference(
            prompt,
            language,
            gpt_cond_latent,
            speaker_embedding,
            repetition_penalty=5.0,
            temperature=0.75,
        )
        inference_time = time.time() - t0
        print(f"I: Time to generate audio: {round(inference_time*1000)} milliseconds")
        metrics_text += f"Time to generate audio: {round(inference_time*1000)} milliseconds\n"
        # RTF = wall-clock time / audio duration; the model outputs 24 kHz.
        real_time_factor = (time.time() - t0) / out['wav'].shape[-1] * 24000
        print(f"Real-time factor (RTF): {real_time_factor}")
        metrics_text += f"Real-time factor (RTF): {real_time_factor:.2f}\n"
        torchaudio.save("output.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000)

    except RuntimeError as e:
        if "device-side assert" in str(e):
            # Unrecoverable in-process: record the failing input, upload it
            # for triage, then restart the Space.
            print(f"Exit due to: Unrecoverable exception caused by language:{language} prompt:{prompt}", flush=True)
            gr.Warning("Unhandled Exception encounter, please retry in a minute")
            print("Cuda device-assert Runtime encountered need restart")
            if not DEVICE_ASSERT_DETECTED:
                DEVICE_ASSERT_DETECTED = 1
                DEVICE_ASSERT_PROMPT = prompt
                DEVICE_ASSERT_LANG = language

            error_time = datetime.datetime.now().strftime("%d-%m-%Y-%H:%M:%S")
            error_data = [
                error_time,
                prompt,
                language,
                audio_file_pth,
                mic_file_path,
                use_mic,
                voice_cleanup,
                no_lang_auto_detect,
                agree,
            ]
            # Stringify every field for the CSV row (don't shadow the caught
            # exception name with the loop variable).
            error_data = [item if isinstance(item, str) else str(item) for item in error_data]
            print(error_data)
            print(speaker_wav)
            write_io = StringIO()
            csv.writer(write_io).writerows([error_data])
            csv_upload = write_io.getvalue().encode()

            # Upload the failing request parameters as a CSV row.
            filename = error_time + "_" + str(uuid.uuid4()) + ".csv"
            print("Writing error csv")
            error_api = HfApi()
            error_api.upload_file(
                path_or_fileobj=csv_upload,
                path_in_repo=filename,
                repo_id="coqui/xtts-flagged-dataset",
                repo_type="dataset",
            )

            # Also upload the reference audio that triggered the assert.
            speaker_filename = error_time + "_reference_" + str(uuid.uuid4()) + ".wav"
            error_api.upload_file(
                path_or_fileobj=speaker_wav,
                path_in_repo=speaker_filename,
                repo_id="coqui/xtts-flagged-dataset",
                repo_type="dataset",
            )

            space = api.get_space_runtime(repo_id=repo_id)
            if space.stage != "BUILDING":
                api.restart_space(repo_id=repo_id)
            else:
                print("TRIED TO RESTART but space is building")

        else:
            if "Failed to decode" in str(e):
                print("Speaker encoding error", str(e))
                gr.Warning("It appears something wrong with reference, did you unmute your microphone?")
            else:
                print("RuntimeError: non device-side assert error:", str(e))
                gr.Warning("Something unexpected happened please retry again.")
            return (None, None, None, None)

    return (
        gr.make_waveform(audio="output.wav"),
        "output.wav",
        metrics_text,
        speaker_wav,
    )

# UI copy: window/tab title and the markdown blurb shown at the top of the demo.
title = "Coqui🐸 XTTS (5000 Char Limit)"

description = """
<br/>
This demo is running **XTTS v2.0.3** with 5000 character limit. <a href="https://huggingface.co/coqui/XTTS-v2">XTTS</a> is a multilingual text-to-speech model with voice cloning. 
<br/>
Supported languages: Arabic (ar), Portuguese (pt), Chinese (zh-cn), Czech (cs), Dutch (nl), English (en), French (fr), German (de), Italian (it), Polish (pl), Russian (ru), Spanish (es), Turkish (tr), Japanese (ja), Korean (ko), Hungarian (hu), Hindi (hi)
<br/>
"""

# Gradio UI: inputs in the left column, synthesis outputs in the right.
# Statement order defines the on-screen layout, so keep it as-is.
with gr.Blocks(analytics_enabled=False) as demo:
    # Header row: Coqui logo on the left, empty spacer column on the right.
    with gr.Row():
        with gr.Column():
            gr.Markdown("""
            ## <img src="https://raw.githubusercontent.com/coqui-ai/TTS/main/images/coqui-log-green-TTS.png" height="56"/>
            """)
        with gr.Column():
            pass

    # Description row: demo blurb plus project links table.
    with gr.Row():
        with gr.Column():
            gr.Markdown(description)
        with gr.Column():
            gr.Markdown("""
            |                                 |                                         |
            | ------------------------------- | --------------------------------------- |
            | 🐸💬 **CoquiTTS**                | <a style="display:inline-block" href='https://github.com/coqui-ai/TTS'><img src='https://img.shields.io/github/stars/coqui-ai/TTS?style=social' /></a>|
            | 💼 **Documentation**            | [ReadTheDocs](https://tts.readthedocs.io/en/latest/) |
            """)

    # Main row: all predict() inputs on the left, outputs on the right.
    with gr.Row():
        with gr.Column():
            input_text_gr = gr.Textbox(
                label="Text Prompt",
                info="Up to 5000 text characters.",
                value="Hi there, I'm your new voice clone. Try your best to upload quality audio.",
                lines=5,
                max_lines=10
            )
            # Choices must match `supported_languages` validated in predict().
            language_gr = gr.Dropdown(
                label="Language",
                choices=["en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn", "ja", "ko", "hu", "hi"],
                value="en",
            )
            ref_gr = gr.Audio(
                label="Reference Audio",
                type="filepath",
                value="examples/female.wav",
            )
            mic_gr = gr.Audio(
                source="microphone",
                type="filepath",
                label="Use Microphone for Reference",
            )
            use_mic_gr = gr.Checkbox(
                label="Use Microphone",
                value=False,
            )
            clean_ref_gr = gr.Checkbox(
                label="Cleanup Reference Voice",
                value=False,
            )
            auto_det_lang_gr = gr.Checkbox(
                label="Do not use language auto-detect",
                value=False,
            )
            # CPML terms checkbox; predict() refuses to run unless checked.
            tos_gr = gr.Checkbox(
                label="Agree to CPML terms",
                value=False,
            )
            tts_button = gr.Button("Generate Speech", elem_id="send-btn", visible=True)

        with gr.Column():
            video_gr = gr.Video(label="Waveform Visual")
            audio_gr = gr.Audio(label="Synthesised Audio", autoplay=True)
            out_text_gr = gr.Text(label="Metrics")
            ref_audio_gr = gr.Audio(label="Reference Audio Used")

    # Wire the button to predict(); input order must match its signature,
    # output order must match its returned 4-tuple.
    tts_button.click(
        predict, 
        [input_text_gr, language_gr, ref_gr, mic_gr, use_mic_gr, clean_ref_gr, auto_det_lang_gr, tos_gr], 
        outputs=[video_gr, audio_gr, out_text_gr, ref_audio_gr]
    )

# Queue requests so concurrent users don't contend for the single GPU model.
demo.queue()  
demo.launch(debug=True, show_api=True)