File size: 6,743 Bytes
50f1b46
bf4353b
50f1b46
681b58a
 
a4b0424
bf4353b
 
 
681b58a
 
8d9fcd0
9aaaf3c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8d9fcd0
 
 
 
a4b0424
 
 
 
 
 
 
8d9fcd0
 
 
 
a4b0424
 
 
 
 
 
 
 
 
681b58a
a4b0424
 
681b58a
a4b0424
681b58a
1c8e78d
 
681b58a
1c8e78d
 
bf4353b
50f1b46
 
 
 
 
 
8d9fcd0
bf4353b
8d9fcd0
681b58a
 
 
8d9fcd0
 
 
 
681b58a
8d9fcd0
bf4353b
 
50f1b46
8d9fcd0
 
 
 
 
 
 
2c102d1
a4b0424
bf4353b
681b58a
8d9fcd0
681b58a
8d9fcd0
a4b0424
8d9fcd0
a4b0424
681b58a
 
50f1b46
9aaaf3c
 
 
 
 
bf4353b
50f1b46
 
 
 
 
9aaaf3c
50f1b46
 
 
 
 
 
 
 
 
681b58a
8d9fcd0
 
 
50f1b46
 
8d9fcd0
 
 
50f1b46
8d9fcd0
 
50f1b46
8d9fcd0
 
 
 
2c102d1
 
50f1b46
8d9fcd0
 
 
9aaaf3c
8d9fcd0
 
50f1b46
 
 
 
 
8d9fcd0
50f1b46
 
8d9fcd0
 
50f1b46
 
 
8d9fcd0
 
 
 
681b58a
 
a4b0424
1c8e78d
8d9fcd0
1c8e78d
8d9fcd0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
# app.py — TalkClone (HF Space, 1-column, persistent output, DownloadButton)

import os, re, tempfile, shutil, time
import numpy as np
import soundfile as sf
import gradio as gr

# Provide a default for COQUI_TOS_AGREED without overriding a value that is
# already present in the environment. (Presumably this lets the Coqui model
# load without an interactive licence prompt -- confirm against the TTS docs.)
os.environ.setdefault("COQUI_TOS_AGREED", "1")

# Model identifier passed to the TTS constructor.
MODEL_NAME = "tts_models/multilingual/multi-dataset/xtts_v2"

# (display label, language code) pairs, in the order they should be listed.
LANGS = [
    ("English", "en"),
    ("Spanish", "es"),
    ("French", "fr"),
    ("German", "de"),
    ("Italian", "it"),
    ("Portuguese", "pt"),
    ("Polish", "pl"),
    ("Turkish", "tr"),
    ("Russian", "ru"),
    ("Dutch", "nl"),
    ("Czech", "cs"),
    ("Arabic", "ar"),
    ("Chinese (Simplified)", "zh-cn"),
    ("Hungarian", "hu"),
    ("Korean", "ko"),
    ("Japanese", "ja"),
    ("Hindi", "hi"),
]

# Label -> code lookup. Labels are unique, so the key order mirrors LANGS.
LANG_MAP = dict(LANGS)
# Display labels only, in LANGS order.
LANG_LABELS = list(LANG_MAP)

# Process-wide cache for the loaded model; filled in by get_tts() on first use.
_tts = None


def get_tts():
    """Return the shared TTS model, constructing it on the first call.

    The first call tries to import torch, caps its thread count at
    ``min(4, cpu_count)`` (at least 1), and asks whether CUDA is available.
    Any failure in that probing simply means the model is built without GPU.
    The model is then created from ``MODEL_NAME``; if the constructor rejects
    the ``gpu`` keyword with a ``TypeError`` it is retried without it.
    Later calls return the cached instance.
    """
    global _tts
    if _tts is None:
        try:
            import torch

            # Thread tuning is best-effort; a failure here must not prevent
            # the CUDA check below from running.
            try:
                torch.set_num_threads(max(1, min(4, os.cpu_count() or 2)))
            except Exception:
                pass
            gpu_available = torch.cuda.is_available()
        except Exception:
            # torch missing or unusable: build the model without GPU.
            gpu_available = False

        # Imported lazily so the module can be loaded before the model is needed.
        from TTS.api import TTS

        try:
            _tts = TTS(MODEL_NAME, gpu=gpu_available)
        except TypeError:
            # Constructor did not accept ``gpu``; use its defaults.
            _tts = TTS(MODEL_NAME)
    return _tts

def clean_text(t: str) -> str:
    """Trim *t* and collapse every run of whitespace into a single space.

    A falsy value (``None`` or ``""``) yields the empty string.
    """
    if not t:
        return ""
    # str.split() with no argument already discards leading/trailing
    # whitespace and splits on whitespace runs.
    words = t.split()
    return " ".join(words)

def synth_to_file_safe(tts, txt, out_path, wav_path, lang, speed):
    """Call ``tts.tts_to_file`` with ``speed``, retrying without it on ``TypeError``.

    Args:
        tts: Object exposing a ``tts_to_file`` method.
        txt: Text to synthesize.
        out_path: Destination path handed over as ``file_path``.
        wav_path: Reference recording handed over as ``speaker_wav``.
        lang: Language code handed over as ``language``.
        speed: Speed value; dropped on the retry.
    """
    common = {
        "text": txt,
        "file_path": out_path,
        "speaker_wav": wav_path,
        "language": lang,
    }
    try:
        tts.tts_to_file(**common, speed=speed)
    except TypeError:
        # Retry with the same arguments minus ``speed``.
        tts.tts_to_file(**common)

def safe_filename(seed_text: str, lang_code: str) -> str:
    """Build a timestamped ``.wav`` filename from the start of *seed_text*.

    The first 40 characters of the whitespace-normalised text are reduced to
    ``[A-Za-z0-9_-]`` (every other run of characters becomes one ``_``) and
    leading/trailing underscores are stripped. If nothing survives -- empty
    input, or text written entirely in a non-Latin script -- the stem falls
    back to ``"talkclone"``.

    Args:
        seed_text: Text the name is derived from; may be empty or ``None``.
        lang_code: Language code inserted between the stem and the timestamp.

    Returns:
        ``"<stem>_<lang_code>_<YYYYmmdd-HHMMSS>.wav"`` using local time.
    """
    base = re.sub(r"[^A-Za-z0-9_-]+", "_", clean_text(seed_text)[:40]).strip("_")
    # Apply the fallback *after* sanitising: text with no ASCII letters or
    # digits is non-empty before the substitution but empty after it.
    if not base:
        base = "talkclone"
    ts = time.strftime("%Y%m%d-%H%M%S")
    return f"{base}_{lang_code}_{ts}.wav"

def _split_long_sentence(sentence, limit=200):
    """Break *sentence* into pieces of at most *limit* characters, preferring to cut at a space."""
    pieces = []
    rest = sentence
    while len(rest) > limit:
        # Last space that still leaves the head within the limit.
        cut = rest.rfind(" ", 0, limit + 1)
        if cut <= 0:
            # No usable space in the window (e.g. unspaced scripts): hard cut.
            cut = limit
        head = rest[:cut].rstrip()
        if head:
            pieces.append(head)
        rest = rest[cut:].lstrip()
    if rest:
        pieces.append(rest)
    return pieces


def tts_clone(text, ref_audio, lang_label, speed, split_sentences, progress=gr.Progress(track_tqdm=True)):
    """Synthesize *text* using *ref_audio* as the speaker reference and save it as WAV.

    Args:
        text: Text to speak; whitespace is normalised with ``clean_text``.
        ref_audio: Path of the reference recording (``None`` is rejected).
        lang_label: Display label looked up in ``LANG_MAP``; unknown labels
            fall back to ``"en"``.
        speed: Speed value forwarded to ``synth_to_file_safe``.
        split_sentences: When true, the text is split after sentence-ending
            punctuation; sentences longer than 220 characters are further
            broken into pieces of at most 200 characters, cut at a space when
            one is available so words are not chopped in half.
        progress: Gradio progress tracker, called once per chunk.

    Returns:
        ``(preview_path, download_path)``. ``preview_path`` is a temp ``.wav``
        holding the concatenated audio; ``download_path`` is a copy with a
        readable name in the system temp directory, or ``preview_path`` itself
        if the copy could not be made.

    Raises:
        gr.Error: If no reference audio was given, the text is empty, or the
            text exceeds 1400 characters while splitting is disabled.
    """
    if ref_audio is None:
        raise gr.Error("Upload a reference voice (10–60s, clean speech).")
    text = clean_text(text)
    if not text:
        raise gr.Error("Please enter some text.")
    if len(text) > 1400 and not split_sentences:
        raise gr.Error("Text is very long. Enable 'Auto split' or paste a shorter chunk on CPU.")

    lang = LANG_MAP.get(lang_label, "en")
    wav_path = ref_audio

    chunks = [text]
    if split_sentences:
        # Split on whitespace that follows sentence-ending punctuation.
        rough = [s.strip() for s in re.split(r'(?<=[.!?؟۔]|[\u0964\u0965])\s+', text) if s.strip()]
        chunks = []
        for s in rough:
            if len(s) <= 220:
                chunks.append(s)
            else:
                chunks.extend(_split_long_sentence(s, 200))

    tts = get_tts()
    out_wavs = []
    # Per-chunk files are only needed until they are read back into memory.
    with tempfile.TemporaryDirectory() as td:
        total = max(len(chunks), 1)
        for i, chunk in enumerate(chunks, 1):
            progress((i - 1) / total, desc=f"Synthesizing {i}/{total}")
            part_path = os.path.join(td, f"part_{i}.wav")
            synth_to_file_safe(tts, chunk, part_path, wav_path, lang, speed)
            data, sr = sf.read(part_path)
            out_wavs.append((data, sr))

    # Join the parts; the first part's sample rate is used for the output.
    if len(out_wavs) == 1:
        final_data, sr = out_wavs[0]
    else:
        sr = out_wavs[0][1]
        final_data = np.concatenate([d for d, _ in out_wavs], axis=0)

    # Reserve a temp file name that outlives this call, then write the audio.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as ntf:
        ntf_path = ntf.name
    sf.write(ntf_path, final_data, sr)

    # Copy to a readable name for downloading. Use the platform temp directory
    # rather than a hard-coded "/tmp" so TMPDIR and non-POSIX hosts work.
    pretty_name = os.path.join(tempfile.gettempdir(), safe_filename(text, lang))
    try:
        shutil.copyfile(ntf_path, pretty_name)
        dl_path = pretty_name
    except OSError:
        dl_path = ntf_path  # fallback: offer the unnamed temp file instead

    # Audio preview path first, then the path for the download button.
    return ntf_path, dl_path

# Stylesheet passed to gr.Blocks(css=...). It caps the container width, gives
# the elements with ids #wrap/#ref/#lang/#txt/#spd/#split/#out_audio/#dl a
# bordered card look, colours the #gen button, and hides footer/settings/API
# links via `display: none`.
CUSTOM_CSS = """
.gradio-container { max-width: 860px !important; margin: 0 auto; }
#wrap, #ref, #lang, #txt, #spd, #split, #out_audio, #dl {
  background: #f8fafc !important;
  border: 1px solid #e5e7eb !important;
  border-radius: 14px !important;
  padding: 14px !important;
}
#ref, #out_audio, #dl { background: #eef2ff !important; }
#gen button, #gen { background: #10b981 !important; color: #fff !important; }
#gen button:hover { filter: brightness(0.95); }
/* hide HF/Gradio chrome */
footer, .footer, #footer,
a[href*="gradio.live"], a[href*="gradio.app"], a[href*="/api"], a[href*="hf.space"],
button[aria-label="Settings"],
[data-testid="block-analytics"], [data-testid="embed-info"] { display: none !important; }
"""

# Single-column UI: inputs on top, Generate button, then the audio preview and
# a download button. Components appear in the order they are created here.
with gr.Blocks(title="TalkClone - Voice Cloning & TTS", css=CUSTOM_CSS, analytics_enabled=False) as demo:
    with gr.Column(elem_id="wrap"):
        gr.Markdown("## TalkClone — Text-to-Speech with Voice Cloning")
        gr.Markdown("Upload a short **reference voice** (10–60s), choose **language**, enter **text**, then **Generate**. "
                    "On free CPU, keep text short or enable **Auto split** for speed.")

        ref_audio = gr.Audio(label="Reference Voice (WAV/MP3)", type="filepath", elem_id="ref")
        language  = gr.Dropdown(choices=LANG_LABELS, value="English", label="Language", elem_id="lang")
        text      = gr.Textbox(label="Text", lines=6, placeholder="Type or paste your text here…", elem_id="txt")
        speed     = gr.Slider(0.7, 1.3, value=1.0, step=0.05, label="Speed", elem_id="spd")
        split     = gr.Checkbox(value=True, label="Auto split long text by sentence", elem_id="split")
        submit    = gr.Button("Generate", variant="primary", elem_id="gen")

        output   = gr.Audio(label="Cloned Speech", type="filepath", interactive=False, elem_id="out_audio")
        download = gr.DownloadButton(label="Download audio", elem_id="dl")

        def run_and_return(text, ref_audio, language, speed, split, progress=gr.Progress(track_tqdm=True)):
            """Click handler: synthesize, then update the preview and the download button.

            ``progress`` is declared on this function because it is the one
            registered with ``submit.click``; it is forwarded so ``tts_clone``
            reports to the tracker Gradio supplies for the event rather than
            to its own default instance.
            """
            audio_path, dl_path = tts_clone(text, ref_audio, language, speed, split, progress=progress)
            # set button to download the file we just wrote
            return audio_path, gr.update(value=dl_path, label=f"Download ({os.path.basename(dl_path)})")

        submit.click(run_and_return,
                     inputs=[text, ref_audio, language, speed, split],
                     outputs=[output, download])

if __name__ == "__main__":
    # Listen on all interfaces; the port comes from $PORT, defaulting to 7860.
    launch_kwargs = {
        "server_name": "0.0.0.0",
        "server_port": int(os.environ.get("PORT", "7860")),
        "show_error": True,
        "show_api": False,
    }
    try:
        # Preferred path: enable the request queue before launching.
        demo.queue().launch(**launch_kwargs)
    except TypeError:
        # Fall back to launching without the explicit queue() call.
        demo.launch(**launch_kwargs)