krupal19 committed on
Commit
56ada58
·
verified ·
1 Parent(s): 65f48d0

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +37 -692
app.py CHANGED
@@ -1,706 +1,51 @@
 
 
1
  import torch
2
- torch.serialization.add_safe_globals(["TTS.tts.configs.xtts_config.XttsConfig"])
3
-
4
- import sys
5
- import io, os, stat
6
- import subprocess
7
- import random
8
- from zipfile import ZipFile
9
- import uuid
10
- import time
11
- import torch
12
- import torchaudio
13
 
 
 
 
 
 
 
 
14
 
15
- #download for mecab
16
- os.system('python -m unidic download')
17
-
18
- # By using XTTS you agree to CPML license https://coqui.ai/cpml
19
  os.environ["COQUI_TOS_AGREED"] = "1"
20
 
21
- # langid is used to detect language for longer text
22
- # Most users expect text to be their own language, there is checkbox to disable it
23
- import langid
24
- import base64
25
- import csv
26
- from io import StringIO
27
- import datetime
28
- import re
29
-
30
- import gradio as gr
31
- from scipy.io.wavfile import write
32
- from pydub import AudioSegment
33
-
34
  from TTS.api import TTS
35
- from TTS.tts.configs.xtts_config import XttsConfig
36
- from TTS.tts.models.xtts import Xtts
37
- from TTS.utils.generic_utils import get_user_data_dir
38
-
39
- HF_TOKEN = os.environ.get("HF_TOKEN")
40
-
41
- from huggingface_hub import HfApi
42
-
43
- # will use api to restart space on a unrecoverable error
44
- api = HfApi(token=HF_TOKEN)
45
- repo_id = "coqui/xtts"
46
-
47
- # Use never ffmpeg binary for Ubuntu20 to use denoising for microphone input
48
- print("Export newer ffmpeg binary for denoise filter")
49
- ZipFile("ffmpeg.zip").extractall()
50
- print("Make ffmpeg binary executable")
51
- st = os.stat("ffmpeg")
52
- os.chmod("ffmpeg", st.st_mode | stat.S_IEXEC)
53
-
54
- # This will trigger downloading model
55
- print("Downloading if not downloaded Coqui XTTS V2")
56
- from TTS.utils.manage import ModelManager
57
-
58
- model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
59
- ModelManager().download_model(model_name)
60
- model_path = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))
61
- print("XTTS downloaded")
62
-
63
- config = XttsConfig()
64
- config.load_json(os.path.join(model_path, "config.json"))
65
-
66
- model = Xtts.init_from_config(config)
67
- model.load_checkpoint(
68
- config,
69
- checkpoint_path=os.path.join(model_path, "model.pth"),
70
- vocab_path=os.path.join(model_path, "vocab.json"),
71
- eval=True,
72
- use_deepspeed=True,
73
- )
74
- model.cuda()
75
-
76
- # This is for debugging purposes only
77
- DEVICE_ASSERT_DETECTED = 0
78
- DEVICE_ASSERT_PROMPT = None
79
- DEVICE_ASSERT_LANG = None
80
-
81
- supported_languages = config.languages
82
-
83
- def predict(
84
- prompt,
85
- language,
86
- audio_file_pth,
87
- mic_file_path,
88
- use_mic,
89
- voice_cleanup,
90
- no_lang_auto_detect,
91
- agree,
92
- ):
93
- if agree == True:
94
- if language not in supported_languages:
95
- gr.Warning(
96
- f"Language you put {language} in is not in is not in our Supported Languages, please choose from dropdown"
97
- )
98
-
99
- return (
100
- None,
101
- None,
102
- None,
103
- None,
104
- )
105
-
106
- language_predicted = langid.classify(prompt)[
107
- 0
108
- ].strip() # strip need as there is space at end!
109
-
110
- # tts expects chinese as zh-cn
111
- if language_predicted == "zh":
112
- # we use zh-cn
113
- language_predicted = "zh-cn"
114
-
115
- print(f"Detected language:{language_predicted}, Chosen language:{language}")
116
-
117
- # After text character length 15 trigger language detection
118
- if len(prompt) > 15:
119
- # allow any language for short text as some may be common
120
- # If user unchecks language autodetection it will not trigger
121
- # You may remove this completely for own use
122
- if language_predicted != language and not no_lang_auto_detect:
123
- # Please duplicate and remove this check if you really want this
124
- # Or auto-detector fails to identify language (which it can on pretty short text or mixed text)
125
- gr.Warning(
126
- f"It looks like your text isn’t the language you chose , if you’re sure the text is the same language you chose, please check disable language auto-detection checkbox"
127
- )
128
-
129
- return (
130
- None,
131
- None,
132
- None,
133
- None,
134
- )
135
-
136
- if use_mic == True:
137
- if mic_file_path is not None:
138
- speaker_wav = mic_file_path
139
- else:
140
- gr.Warning(
141
- "Please record your voice with Microphone, or uncheck Use Microphone to use reference audios"
142
- )
143
- return (
144
- None,
145
- None,
146
- None,
147
- None,
148
- )
149
-
150
- else:
151
- speaker_wav = audio_file_pth
152
-
153
- # Filtering for microphone input, as it has BG noise, maybe silence in beginning and end
154
- # This is fast filtering not perfect
155
-
156
- # Apply all on demand
157
- lowpassfilter = denoise = trim = loudness = True
158
-
159
- if lowpassfilter:
160
- lowpass_highpass = "lowpass=8000,highpass=75,"
161
- else:
162
- lowpass_highpass = ""
163
-
164
- if trim:
165
- # better to remove silence in beginning and end for microphone
166
- trim_silence = "areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02,areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02,"
167
- else:
168
- trim_silence = ""
169
-
170
- if voice_cleanup:
171
- try:
172
- out_filename = (
173
- speaker_wav + str(uuid.uuid4()) + ".wav"
174
- ) # ffmpeg to know output format
175
-
176
- # we will use newer ffmpeg as that has afftn denoise filter
177
- shell_command = f"./ffmpeg -y -i {speaker_wav} -af {lowpass_highpass}{trim_silence} {out_filename}".split(
178
- " "
179
- )
180
-
181
- command_result = subprocess.run(
182
- [item for item in shell_command],
183
- capture_output=False,
184
- text=True,
185
- check=True,
186
- )
187
- speaker_wav = out_filename
188
- print("Filtered microphone input")
189
- except subprocess.CalledProcessError:
190
- # There was an error - command exited with non-zero code
191
- print("Error: failed filtering, use original microphone input")
192
- else:
193
- speaker_wav = speaker_wav
194
-
195
- if len(prompt) < 2:
196
- gr.Warning("Please give a longer prompt text")
197
- return (
198
- None,
199
- None,
200
- None,
201
- None,
202
- )
203
- if len(prompt) > 200:
204
- gr.Warning(
205
- "Text length limited to 200 characters for this demo, please try shorter text. You can clone this space and edit code for your own usage"
206
- )
207
- return (
208
- None,
209
- None,
210
- None,
211
- None,
212
- )
213
- global DEVICE_ASSERT_DETECTED
214
- if DEVICE_ASSERT_DETECTED:
215
- global DEVICE_ASSERT_PROMPT
216
- global DEVICE_ASSERT_LANG
217
- # It will likely never come here as we restart space on first unrecoverable error now
218
- print(
219
- f"Unrecoverable exception caused by language:{DEVICE_ASSERT_LANG} prompt:{DEVICE_ASSERT_PROMPT}"
220
- )
221
 
222
- # HF Space specific.. This error is unrecoverable need to restart space
223
- space = api.get_space_runtime(repo_id=repo_id)
224
- if space.stage!="BUILDING":
225
- api.restart_space(repo_id=repo_id)
226
- else:
227
- print("TRIED TO RESTART but space is building")
228
 
229
- try:
230
- metrics_text = ""
231
- t_latent = time.time()
232
 
233
- # note diffusion_conditioning not used on hifigan (default mode), it will be empty but need to pass it to model.inference
234
- try:
235
- (
236
- gpt_cond_latent,
237
- speaker_embedding,
238
- ) = model.get_conditioning_latents(audio_path=speaker_wav, gpt_cond_len=30, gpt_cond_chunk_len=4, max_ref_length=60)
239
- except Exception as e:
240
- print("Speaker encoding error", str(e))
241
- gr.Warning(
242
- "It appears something wrong with reference, did you unmute your microphone?"
243
- )
244
- return (
245
- None,
246
- None,
247
- None,
248
- None,
249
- )
250
 
251
- latent_calculation_time = time.time() - t_latent
252
- # metrics_text=f"Embedding calculation time: {latent_calculation_time:.2f} seconds\n"
 
 
 
 
253
 
254
- # temporary comma fix
255
- prompt= re.sub("([^\x00-\x7F]|\w)(\.|\。|\?)",r"\1 \2\2",prompt)
256
 
257
- wav_chunks = []
258
- ## Direct mode
259
-
260
- print("I: Generating new audio...")
261
- t0 = time.time()
262
- out = model.inference(
263
- prompt,
264
- language,
265
- gpt_cond_latent,
266
- speaker_embedding,
267
- repetition_penalty=5.0,
268
- temperature=0.75,
269
- )
270
- inference_time = time.time() - t0
271
- print(f"I: Time to generate audio: {round(inference_time*1000)} milliseconds")
272
- metrics_text+=f"Time to generate audio: {round(inference_time*1000)} milliseconds\n"
273
- real_time_factor= (time.time() - t0) / out['wav'].shape[-1] * 24000
274
- print(f"Real-time factor (RTF): {real_time_factor}")
275
- metrics_text+=f"Real-time factor (RTF): {real_time_factor:.2f}\n"
276
- torchaudio.save("output.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000)
277
 
278
-
279
- """
280
- print("I: Generating new audio in streaming mode...")
281
- t0 = time.time()
282
- chunks = model.inference_stream(
283
- prompt,
284
- language,
285
- gpt_cond_latent,
286
- speaker_embedding,
287
- repetition_penalty=7.0,
288
- temperature=0.85,
289
- )
290
-
291
- first_chunk = True
292
- for i, chunk in enumerate(chunks):
293
- if first_chunk:
294
- first_chunk_time = time.time() - t0
295
- metrics_text += f"Latency to first audio chunk: {round(first_chunk_time*1000)} milliseconds\n"
296
- first_chunk = False
297
- wav_chunks.append(chunk)
298
- print(f"Received chunk {i} of audio length {chunk.shape[-1]}")
299
- inference_time = time.time() - t0
300
- print(
301
- f"I: Time to generate audio: {round(inference_time*1000)} milliseconds"
302
- )
303
- #metrics_text += (
304
- # f"Time to generate audio: {round(inference_time*1000)} milliseconds\n"
305
- #)
306
-
307
- wav = torch.cat(wav_chunks, dim=0)
308
- print(wav.shape)
309
- real_time_factor = (time.time() - t0) / wav.shape[0] * 24000
310
- print(f"Real-time factor (RTF): {real_time_factor}")
311
- metrics_text += f"Real-time factor (RTF): {real_time_factor:.2f}\n"
312
-
313
- torchaudio.save("output.wav", wav.squeeze().unsqueeze(0).cpu(), 24000)
314
- """
315
-
316
- except RuntimeError as e:
317
- if "device-side assert" in str(e):
318
- # cannot do anything on cuda device side error, need tor estart
319
- print(
320
- f"Exit due to: Unrecoverable exception caused by language:{language} prompt:{prompt}",
321
- flush=True,
322
- )
323
- gr.Warning("Unhandled Exception encounter, please retry in a minute")
324
- print("Cuda device-assert Runtime encountered need restart")
325
- if not DEVICE_ASSERT_DETECTED:
326
- DEVICE_ASSERT_DETECTED = 1
327
- DEVICE_ASSERT_PROMPT = prompt
328
- DEVICE_ASSERT_LANG = language
329
-
330
- # just before restarting save what caused the issue so we can handle it in future
331
- # Uploading Error data only happens for unrecovarable error
332
- error_time = datetime.datetime.now().strftime("%d-%m-%Y-%H:%M:%S")
333
- error_data = [
334
- error_time,
335
- prompt,
336
- language,
337
- audio_file_pth,
338
- mic_file_path,
339
- use_mic,
340
- voice_cleanup,
341
- no_lang_auto_detect,
342
- agree,
343
- ]
344
- error_data = [str(e) if type(e) != str else e for e in error_data]
345
- print(error_data)
346
- print(speaker_wav)
347
- write_io = StringIO()
348
- csv.writer(write_io).writerows([error_data])
349
- csv_upload = write_io.getvalue().encode()
350
-
351
- filename = error_time + "_" + str(uuid.uuid4()) + ".csv"
352
- print("Writing error csv")
353
- error_api = HfApi()
354
- error_api.upload_file(
355
- path_or_fileobj=csv_upload,
356
- path_in_repo=filename,
357
- repo_id="coqui/xtts-flagged-dataset",
358
- repo_type="dataset",
359
- )
360
-
361
- # speaker_wav
362
- print("Writing error reference audio")
363
- speaker_filename = (
364
- error_time + "_reference_" + str(uuid.uuid4()) + ".wav"
365
- )
366
- error_api = HfApi()
367
- error_api.upload_file(
368
- path_or_fileobj=speaker_wav,
369
- path_in_repo=speaker_filename,
370
- repo_id="coqui/xtts-flagged-dataset",
371
- repo_type="dataset",
372
- )
373
-
374
- # HF Space specific.. This error is unrecoverable need to restart space
375
- space = api.get_space_runtime(repo_id=repo_id)
376
- if space.stage!="BUILDING":
377
- api.restart_space(repo_id=repo_id)
378
- else:
379
- print("TRIED TO RESTART but space is building")
380
 
381
- else:
382
- if "Failed to decode" in str(e):
383
- print("Speaker encoding error", str(e))
384
- gr.Warning(
385
- "It appears something wrong with reference, did you unmute your microphone?"
386
- )
387
- else:
388
- print("RuntimeError: non device-side assert error:", str(e))
389
- gr.Warning("Something unexpected happened please retry again.")
390
- return (
391
- None,
392
- None,
393
- None,
394
- None,
395
- )
396
- return (
397
- gr.make_waveform(
398
- audio="output.wav",
399
- ),
400
- "output.wav",
401
- metrics_text,
402
- speaker_wav,
403
- )
404
- else:
405
- gr.Warning("Please accept the Terms & Condition!")
406
- return (
407
- None,
408
- None,
409
- None,
410
- None,
411
- )
412
-
413
-
414
- title = "Coqui🐸 XTTS"
415
-
416
- description = """
417
-
418
- <br/>
419
-
420
- This demo is currently running **XTTS v2.0.3** <a href="https://huggingface.co/coqui/XTTS-v2">XTTS</a> is a multilingual text-to-speech and voice-cloning model. This demo features zero-shot voice cloning, however, you can fine-tune XTTS for better results. Leave a star 🌟 on Github <a href="https://github.com/coqui-ai/TTS">🐸TTS</a>, where our open-source inference and training code lives.
421
-
422
- <br/>
423
-
424
- Supported languages: Arabic: ar, Brazilian Portuguese: pt , Mandarin Chinese: zh-cn, Czech: cs, Dutch: nl, English: en, French: fr, German: de, Italian: it, Polish: pl, Russian: ru, Spanish: es, Turkish: tr, Japanese: ja, Korean: ko, Hungarian: hu, Hindi: hi
425
-
426
- <br/>
427
- """
428
-
429
- links = """
430
- <img referrerpolicy="no-referrer-when-downgrade" src="https://static.scarf.sh/a.png?x-pxid=0d00920c-8cc9-4bf3-90f2-a615797e5f59" />
431
-
432
- | | |
433
- | ------------------------------- | --------------------------------------- |
434
- | 🐸💬 **CoquiTTS** | <a style="display:inline-block" href='https://github.com/coqui-ai/TTS'><img src='https://img.shields.io/github/stars/coqui-ai/TTS?style=social' /></a>|
435
- | 💼 **Documentation** | [ReadTheDocs](https://tts.readthedocs.io/en/latest/)
436
- | 👩‍💻 **Questions** | [GitHub Discussions](https://github.com/coqui-ai/TTS/discussions) |
437
- | 🗯 **Community** | [![Dicord](https://img.shields.io/discord/1037326658807533628?color=%239B59B6&label=chat%20on%20discord)](https://discord.gg/5eXr5seRrv) |
438
-
439
-
440
- """
441
-
442
- article = """
443
- <div style='margin:20px auto;'>
444
- <p>By using this demo you agree to the terms of the Coqui Public Model License at https://coqui.ai/cpml</p>
445
- <p>We collect data only for error cases for improvement.</p>
446
- </div>
447
- """
448
- examples = [
449
- [
450
- "Once when I was six years old I saw a magnificent picture",
451
- "en",
452
- "examples/female.wav",
453
- None,
454
- False,
455
- False,
456
- False,
457
- True,
458
- ],
459
- [
460
- "Lorsque j'avais six ans j'ai vu, une fois, une magnifique image",
461
- "fr",
462
- "examples/male.wav",
463
- None,
464
- False,
465
- False,
466
- False,
467
- True,
468
- ],
469
- [
470
- "Als ich sechs war, sah ich einmal ein wunderbares Bild",
471
- "de",
472
- "examples/female.wav",
473
- None,
474
- False,
475
- False,
476
- False,
477
- True,
478
- ],
479
- [
480
- "Cuando tenía seis años, vi una vez una imagen magnífica",
481
- "es",
482
- "examples/male.wav",
483
- None,
484
- False,
485
- False,
486
- False,
487
- True,
488
- ],
489
- [
490
- "Quando eu tinha seis anos eu vi, uma vez, uma imagem magnífica",
491
- "pt",
492
- "examples/female.wav",
493
- None,
494
- False,
495
- False,
496
- False,
497
- True,
498
- ],
499
- [
500
- "Kiedy miałem sześć lat, zobaczyłem pewnego razu wspaniały obrazek",
501
- "pl",
502
- "examples/male.wav",
503
- None,
504
- False,
505
- False,
506
- False,
507
- True,
508
- ],
509
- [
510
- "Un tempo lontano, quando avevo sei anni, vidi un magnifico disegno",
511
- "it",
512
- "examples/female.wav",
513
- None,
514
- False,
515
- False,
516
- False,
517
- True,
518
- ],
519
- [
520
- "Bir zamanlar, altı yaşındayken, muhteşem bir resim gördüm",
521
- "tr",
522
- "examples/female.wav",
523
- None,
524
- False,
525
- False,
526
- False,
527
- True,
528
- ],
529
- [
530
- "Когда мне было шесть лет, я увидел однажды удивительную картинку",
531
- "ru",
532
- "examples/female.wav",
533
- None,
534
- False,
535
- False,
536
- False,
537
- True,
538
- ],
539
- [
540
- "Toen ik een jaar of zes was, zag ik op een keer een prachtige plaat",
541
- "nl",
542
- "examples/male.wav",
543
- None,
544
- False,
545
- False,
546
- False,
547
- True,
548
- ],
549
- [
550
- "Když mi bylo šest let, viděl jsem jednou nádherný obrázek",
551
- "cs",
552
- "examples/female.wav",
553
- None,
554
- False,
555
- False,
556
- False,
557
- True,
558
- ],
559
- [
560
- "当我还只有六岁的时候, 看到了一副精彩的插画",
561
- "zh-cn",
562
- "examples/female.wav",
563
- None,
564
- False,
565
- False,
566
- False,
567
- True,
568
- ],
569
- [
570
- "かつて 六歳のとき、素晴らしい絵を見ました",
571
- "ja",
572
- "examples/female.wav",
573
- None,
574
- False,
575
- True,
576
- False,
577
- True,
578
- ],
579
- [
580
- "한번은 내가 여섯 살이었을 때 멋진 그림을 보았습니다.",
581
- "ko",
582
- "examples/female.wav",
583
- None,
584
- False,
585
- True,
586
- False,
587
- True,
588
- ],
589
- [
590
- "Egyszer hat éves koromban láttam egy csodálatos képet",
591
- "hu",
592
- "examples/male.wav",
593
- None,
594
- False,
595
- True,
596
- False,
597
- True,
598
- ],
599
- ]
600
-
601
-
602
-
603
- with gr.Blocks(analytics_enabled=False) as demo:
604
- with gr.Row():
605
- with gr.Column():
606
- gr.Markdown(
607
- """
608
- ## <img src="https://raw.githubusercontent.com/coqui-ai/TTS/main/images/coqui-log-green-TTS.png" height="56"/>
609
- """
610
- )
611
- with gr.Column():
612
- # placeholder to align the image
613
- pass
614
-
615
- with gr.Row():
616
- with gr.Column():
617
- gr.Markdown(description)
618
- with gr.Column():
619
- gr.Markdown(links)
620
-
621
- with gr.Row():
622
- with gr.Column():
623
- input_text_gr = gr.Textbox(
624
- label="Text Prompt",
625
- info="One or two sentences at a time is better. Up to 200 text characters.",
626
- value="Hi there, I'm your new voice clone. Try your best to upload quality audio.",
627
- )
628
- language_gr = gr.Dropdown(
629
- label="Language",
630
- info="Select an output language for the synthesised speech",
631
- choices=[
632
- "en",
633
- "es",
634
- "fr",
635
- "de",
636
- "it",
637
- "pt",
638
- "pl",
639
- "tr",
640
- "ru",
641
- "nl",
642
- "cs",
643
- "ar",
644
- "zh-cn",
645
- "ja",
646
- "ko",
647
- "hu",
648
- "hi"
649
- ],
650
- max_choices=1,
651
- value="en",
652
- )
653
- ref_gr = gr.Audio(
654
- label="Reference Audio",
655
- info="Click on the ✎ button to upload your own target speaker audio",
656
- type="filepath",
657
- value="examples/female.wav",
658
- )
659
- mic_gr = gr.Audio(
660
- source="microphone",
661
- type="filepath",
662
- info="Use your microphone to record audio",
663
- label="Use Microphone for Reference",
664
- )
665
- use_mic_gr = gr.Checkbox(
666
- label="Use Microphone",
667
- value=False,
668
- info="Notice: Microphone input may not work properly under traffic",
669
- )
670
- clean_ref_gr = gr.Checkbox(
671
- label="Cleanup Reference Voice",
672
- value=False,
673
- info="This check can improve output if your microphone or reference voice is noisy",
674
- )
675
- auto_det_lang_gr = gr.Checkbox(
676
- label="Do not use language auto-detect",
677
- value=False,
678
- info="Check to disable language auto-detection",
679
- )
680
- tos_gr = gr.Checkbox(
681
- label="Agree",
682
- value=False,
683
- info="I agree to the terms of the CPML: https://coqui.ai/cpml",
684
- )
685
-
686
- tts_button = gr.Button("Send", elem_id="send-btn", visible=True)
687
-
688
-
689
- with gr.Column():
690
- video_gr = gr.Video(label="Waveform Visual")
691
- audio_gr = gr.Audio(label="Synthesised Audio", autoplay=True)
692
- out_text_gr = gr.Text(label="Metrics")
693
- ref_audio_gr = gr.Audio(label="Reference Audio Used")
694
-
695
- with gr.Row():
696
- gr.Examples(examples,
697
- label="Examples",
698
- inputs=[input_text_gr, language_gr, ref_gr, mic_gr, use_mic_gr, clean_ref_gr, auto_det_lang_gr, tos_gr],
699
- outputs=[video_gr, audio_gr, out_text_gr, ref_audio_gr],
700
- fn=predict,
701
- cache_examples=False,)
702
-
703
- tts_button.click(predict, [input_text_gr, language_gr, ref_gr, mic_gr, use_mic_gr, clean_ref_gr, auto_det_lang_gr, tos_gr], outputs=[video_gr, audio_gr, out_text_gr, ref_audio_gr])
704
-
705
- demo.queue()
706
- demo.launch(debug=True, show_api=True)
 
1
+ import spaces
2
+ import gradio as gr
3
  import torch
4
+ import os
 
 
 
 
 
 
 
 
 
 
5
 
6
import functools

# torch >= 2.6 defaults torch.load(weights_only=True), which breaks TTS's
# pickle-based config loading. Restore the older behavior before TTS imports.
#
# SECURITY NOTE: weights_only=False lets torch.load unpickle arbitrary objects,
# which can execute code embedded in a malicious checkpoint. Only load
# checkpoints from sources you trust. Callers that pass weights_only
# explicitly keep their own choice.
#
# If this code runs more than once in the same process, reuse the function
# remembered on the existing wrapper so wrappers do not pile up.
_orig_torch_load = getattr(torch.load, "_unpatched_load", torch.load)


@functools.wraps(_orig_torch_load)
def _patched_torch_load(*args, **kwargs):
    """Call the original ``torch.load`` with ``weights_only=False`` by default.

    All positional and keyword arguments are forwarded unchanged; only a
    missing ``weights_only`` keyword is filled in.
    """
    kwargs.setdefault("weights_only", False)
    return _orig_torch_load(*args, **kwargs)


# Remember the unwrapped function on the wrapper for the re-run case above.
_patched_torch_load._unpatched_load = _orig_torch_load
torch.load = _patched_torch_load
13
 
 
 
 
 
14
  os.environ["COQUI_TOS_AGREED"] = "1"
15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  from TTS.api import TTS
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
+ device = "cuda"
 
 
 
 
 
19
 
20
+ tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
 
 
21
 
22
@spaces.GPU
def clone(text, audio):
    """Synthesize ``text`` in the voice of the ``audio`` reference clip.

    Args:
        text: Text to speak. The language passed to the model is fixed to "en".
        audio: Filesystem path to the reference recording.

    Returns:
        Path to the generated WAV file.

    Raises:
        gr.Error: If ``text`` is blank or no reference audio was supplied.
    """
    import tempfile

    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")
    if not audio:
        raise gr.Error("Please provide a voice reference audio file.")

    # Write each result to its own file: a shared "./output.wav" would be
    # overwritten when two requests are handled at the same time.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        output_path = tmp.name

    tts.tts_to_file(text=text, speaker_wav=audio, language="en", file_path=output_path)
    return output_path
 
 
 
 
 
 
 
 
 
 
 
 
 
26
 
27
+ iface = gr.Interface(fn=clone,
28
+ inputs=[gr.Textbox(label='Text'),gr.Audio(type='filepath', label='Voice reference audio file')],
29
+ outputs=gr.Audio(type='filepath'),
30
+ title='Voice Clone',
31
+ description="""
32
+ by [Tony Assi](https://www.tonyassi.com/)
33
 
34
+ ---
 
35
 
36
+ <h3>If you like voice clone then try <a href="https://huggingface.co/spaces/tonyassi/video-face-swap" target="_blank" rel="noopener noreferrer">Video Face Swap</a></h3>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
 
38
+ ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
 
40
+ This space uses xtts_v2 model. Non-commercial use only. [Coqui Public Model License](https://huggingface.co/coqui/XTTS-v2/blob/main/LICENSE.txt)
41
+
42
+ Please ❤️ this Space. [Email me](mailto:tony.assi.media@gmail.com).
43
+ """,
44
+ theme = gr.themes.Base(primary_hue="teal",secondary_hue="teal",neutral_hue="slate"),
45
+ examples=[["Hey! It's me Dorthy, from the Wizard of Oz. Type in whatever you'd like me to say.","./audio/Wizard-of-Oz-Dorthy.wav"],
46
+ ["It's me Vito Corleone, from the Godfather. Type in whatever you'd like me to say.","./audio/Godfather.wav"],
47
+ ["Hey, it's me Paris Hilton. Type in whatever you'd like me to say.","./audio/Paris-Hilton.mp3"],
48
+ ["Hey, it's me Megan Fox from Transformers. Type in whatever you'd like me to say.","./audio/Megan-Fox.mp3"],
49
+ ["Hey there, it's me Jeff Goldblum. Type in whatever you'd like me to say.","./audio/Jeff-Goldblum.mp3"],
50
+ ["Hey there, it's me Heath Ledger as the Joker. Type in whatever you'd like me to say.","./audio/Heath-Ledger.mp3"],])
51
+ iface.launch()