asbgig committed on
Commit
681b58a
·
verified ·
1 Parent(s): 13f3113

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +106 -0
app.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import gradio as gr
3
+ import tempfile, os
4
+ import numpy as np
5
+ import soundfile as sf
6
+ from TTS.api import TTS
7
+
8
# Coqui asks for interactive acceptance of the CPML licence the first time a
# model is downloaded. Its model manager skips that prompt only when
# COQUI_TOS_AGREED is exactly "1", so pre-accept with that value; a value the
# deployment has already set is left untouched.
os.environ.setdefault("COQUI_TOS_AGREED", "1")

# Identifier of the Coqui model used for synthesis.
MODEL_NAME = "tts_models/multilingual/multi-dataset/xtts_v2"
13
+
14
# The model object is built on first use rather than at import time, so the
# module can be loaded without constructing the model up front.
_tts_obj = None


def get_tts():
    """Return the shared TTS instance, constructing it on the first call."""
    global _tts_obj
    if _tts_obj is not None:
        return _tts_obj
    _tts_obj = TTS(MODEL_NAME)
    return _tts_obj
21
+
22
# (label, code) pairs offered in the language dropdown; the code is what gets
# passed to the synthesiser as ``language``.
# NOTE(review): "ur" does not appear in the published XTTS v2 language list —
# confirm the model accepts it before relying on this entry.
LANGS = [
    ("English", "en"),
    ("Urdu", "ur"),
    ("Hindi", "hi"),
    ("Arabic", "ar"),
    ("French", "fr"),
    ("German", "de"),
    ("Spanish", "es"),
    ("Italian", "it"),
    ("Portuguese", "pt"),
    ("Turkish", "tr"),
]
27
+
28
def clean_text(text: str) -> str:
    """Collapse every whitespace run to one space and trim both ends.

    Falsy input (``None`` or an empty string) yields ``""``.
    """
    if not text:
        return ""
    words = text.split()
    return " ".join(words)
30
+
31
def synth_to_file_safe(tts, txt, out_path, wav_path, lang, speed):
    """Synthesise ``txt`` into ``out_path``, tolerating a missing ``speed`` option.

    The call is first made with ``speed``; if that raises ``TypeError`` the
    same call is repeated without it.

    Args:
        tts: Object exposing ``tts_to_file``.
        txt: Text to speak.
        out_path: Destination audio file path.
        wav_path: Reference speaker audio, passed as ``speaker_wav``.
        lang: Language code, passed as ``language``.
        speed: Speed value, passed as ``speed`` on the first attempt only.
    """
    common = {
        "text": txt,
        "file_path": out_path,
        "speaker_wav": wav_path,
        "language": lang,
    }
    try:
        tts.tts_to_file(**common, speed=speed)
    except TypeError:
        # Retry without the speed argument.
        tts.tts_to_file(**common)
42
+
43
def tts_clone(text, ref_audio, language_code, speed, split_sentences):
    """Synthesise ``text`` in the voice of ``ref_audio`` and return a WAV path.

    Args:
        text: Text to speak; whitespace is normalised with ``clean_text``.
        ref_audio: Path of the reference voice sample, passed to the model as
            ``speaker_wav``.
        language_code: Language code passed to the model.
        speed: Speed value forwarded to ``synth_to_file_safe``.
        split_sentences: When true, the text is split after ``.``, ``!`` or
            ``?`` and each piece is synthesised separately, then concatenated.

    Returns:
        Path of a WAV file holding the complete audio. The file lives outside
        the scratch directory used for the per-chunk files, so it still exists
        after this function returns.

    Raises:
        gr.Error: If no reference audio was given or the text is empty.
    """
    if ref_audio is None:
        raise gr.Error("Please upload a reference voice sample (10–60 seconds).")
    text = clean_text(text)
    if not text:
        raise gr.Error("Please enter some text.")

    tts = get_tts()

    chunks = [text]
    if split_sentences:
        import re

        chunks = [s.strip() for s in re.split(r'(?<=[.!?])\s+', text) if s.strip()]

    # Per-chunk files are intermediates only: read them back into memory and
    # let the scratch directory be removed as soon as synthesis is done.
    segments = []
    sample_rate = None
    with tempfile.TemporaryDirectory() as td:
        for i, chunk in enumerate(chunks, 1):
            part_path = os.path.join(td, f"part_{i}.wav")
            synth_to_file_safe(tts, chunk, part_path, ref_audio, language_code, speed)
            data, sr = sf.read(part_path)
            if sample_rate is None:
                # The first chunk's sample rate is used for the final file.
                sample_rate = sr
            segments.append(data)

    if len(segments) == 1:
        final_data = segments[0]
    else:
        final_data = np.concatenate(segments, axis=0)

    # The result must outlive this call, so it cannot be written inside the
    # TemporaryDirectory above (that directory is deleted on exit, which would
    # leave the caller with a path to a file that no longer exists).
    fd, final_path = tempfile.mkstemp(prefix="talkclone_", suffix=".wav")
    os.close(fd)
    sf.write(final_path, final_data, sample_rate)
    return final_path
75
+
76
# Build the Gradio interface: inputs in the first column, results in the second.
with gr.Blocks(title="TalkClone - Voice Cloning & TTS", css="#warning{border-left:4px solid #22c55e;padding-left:8px;}") as demo:
    gr.Markdown("# TalkClone — Turn Text into Speech using a Reference Audio")
    gr.Markdown(
        "Upload a short, clean **reference voice** (10–60s), pick a **language**, type your **text**, and generate audio. "
        "For best results: no music/background noise, single speaker, 16kHz+ WAV/MP3."
    )
    with gr.Row():
        with gr.Column():
            # Input controls; their values are passed to run_and_return below.
            ref_audio = gr.Audio(label="Reference Voice (WAV/MP3)", type="filepath")
            language = gr.Dropdown(choices=LANGS, value="en", label="Language")
            text = gr.Textbox(label="Text", lines=5, placeholder="Type your text here...")
            speed = gr.Slider(0.7, 1.3, value=1.0, step=0.05, label="Speed")
            split = gr.Checkbox(value=True, label="Auto split long text by sentence")
            submit = gr.Button("Generate", variant="primary")
            # The "warning" id is the target of the css rule given to gr.Blocks.
            gr.Markdown(
                '<div id="warning"><strong>Consent & Safety:</strong> Only clone voices you have explicit permission to use. '
                "Avoid public-figure impersonation and disclose AI-generated audio when required by law.</div>"
            )
        with gr.Column():
            # Both output components receive the same generated file path.
            output = gr.Audio(label="Cloned Speech", type="filepath", interactive=False)
            download = gr.File(label="Download audio")

    def run_and_return(text, ref_audio, language, speed, split):
        """Run tts_clone and return its path twice: for the player and the download."""
        out_path = tts_clone(text, ref_audio, language, speed, split)
        return out_path, out_path

    # The order of `inputs` matches run_and_return's parameters.
    submit.click(run_and_return, inputs=[text, ref_audio, language, speed, split],
                 outputs=[output, download])
104
+
105
# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()