# app.py — TalkClone (HF Space, 1-column, persistent output, DownloadButton)
"""Gradio front-end for voice-cloned text-to-speech using Coqui XTTS v2.

The user uploads a reference recording, picks a language, enters text and
receives a synthesized WAV both as an inline preview and as a download.
"""
import os
import re
import shutil
import tempfile
import time

import gradio as gr
import numpy as np
import soundfile as sf

# NOTE(review): presumably pre-accepts the Coqui model licence prompt so a
# headless Space does not block on it — confirm against the TTS package docs.
os.environ.setdefault("COQUI_TOS_AGREED", "1")

MODEL_NAME = "tts_models/multilingual/multi-dataset/xtts_v2"

# (display label, language code passed to the model)
LANGS = [
    ("English", "en"),
    ("Spanish", "es"),
    ("French", "fr"),
    ("German", "de"),
    ("Italian", "it"),
    ("Portuguese", "pt"),
    ("Polish", "pl"),
    ("Turkish", "tr"),
    ("Russian", "ru"),
    ("Dutch", "nl"),
    ("Czech", "cs"),
    ("Arabic", "ar"),
    ("Chinese (Simplified)", "zh-cn"),
    ("Hungarian", "hu"),
    ("Korean", "ko"),
    ("Japanese", "ja"),
    ("Hindi", "hi"),
]
LANG_LABELS = [name for name, _ in LANGS]
LANG_MAP = {name: code for name, code in LANGS}

# Text longer than this must be synthesized with sentence splitting enabled.
MAX_UNSPLIT_CHARS = 1400
# Sentences up to this length are synthesized as a single chunk.
MAX_CHUNK_CHARS = 220
# Longer sentences are wrapped into pieces of at most this many characters.
HARD_WRAP_CHARS = 200

# Whitespace that follows sentence-ending punctuation (Latin, Arabic/Urdu and
# Devanagari danda marks). Compiled once instead of on every request.
_SENTENCE_BOUNDARY = re.compile(r'(?<=[.!?؟۔]|[\u0964\u0965])\s+')

# Lazily created model instance shared by all requests.
_tts = None


def get_tts():
    """Return the shared TTS model, creating it on first use.

    ``torch`` and ``TTS`` are imported here rather than at module import time,
    so the heavy model stack is only loaded when the first synthesis runs.
    """
    global _tts
    if _tts is not None:
        return _tts

    try:
        import torch

        try:
            # Cap the CPU thread count at 4 (2 if the core count is unknown).
            torch.set_num_threads(max(1, min(4, os.cpu_count() or 2)))
        except Exception:
            # Best effort only: thread tuning must never block model loading.
            pass
        use_gpu = torch.cuda.is_available()
    except Exception:
        # torch missing or broken: fall back to CPU and let TTS decide.
        use_gpu = False

    from TTS.api import TTS

    try:
        _tts = TTS(MODEL_NAME, gpu=use_gpu)
    except TypeError:
        # Constructor without a ``gpu`` keyword.
        _tts = TTS(MODEL_NAME)
    return _tts


def clean_text(t: str) -> str:
    """Collapse every run of whitespace in *t* to one space and trim the ends.

    ``None`` and empty input both yield an empty string.
    """
    return " ".join((t or "").strip().split())


def synth_to_file_safe(tts, txt, out_path, wav_path, lang, speed):
    """Synthesize *txt* to *out_path*, cloning the voice found in *wav_path*.

    Tries to pass ``speed`` first and retries without it when the call raises
    ``TypeError`` (i.e. the installed ``tts_to_file`` rejects that keyword).
    """
    try:
        tts.tts_to_file(
            text=txt,
            file_path=out_path,
            speaker_wav=wav_path,
            language=lang,
            speed=speed,
        )
    except TypeError:
        tts.tts_to_file(
            text=txt,
            file_path=out_path,
            speaker_wav=wav_path,
            language=lang,
        )


def safe_filename(seed_text: str, lang_code: str) -> str:
    """Build a download file name of the form ``<text>_<lang>_<timestamp>.wav``.

    The first 40 characters of the cleaned text are reduced to
    ``[A-Za-z0-9_-]``. The ``talkclone`` fallback is applied *after* that
    reduction, so text made only of non-ASCII characters (Chinese, Arabic,
    Hindi, ...) still gets a meaningful name instead of an empty prefix.
    """
    base = re.sub(r"[^A-Za-z0-9_-]+", "_", clean_text(seed_text)[:40]).strip("_")
    if not base:
        base = "talkclone"
    ts = time.strftime("%Y%m%d-%H%M%S")
    return f"{base}_{lang_code}_{ts}.wav"


def _wrap_long_sentence(sentence: str, limit: int = HARD_WRAP_CHARS) -> list:
    """Break *sentence* into pieces of at most *limit* characters.

    Cuts at the last space that fits so words stay intact; when a piece has no
    space at all (e.g. scripts written without spaces) it is cut at *limit*.
    """
    pieces = []
    rest = sentence
    while len(rest) > limit:
        cut = rest.rfind(" ", 0, limit + 1)
        if cut <= 0:
            pieces.append(rest[:limit])
            rest = rest[limit:]
        else:
            pieces.append(rest[:cut])
            rest = rest[cut + 1:]  # drop the space we cut at
    if rest:
        pieces.append(rest)
    return pieces


def _split_into_chunks(text: str) -> list:
    """Split cleaned *text* into sentence-sized chunks for synthesis."""
    chunks = []
    for sentence in _SENTENCE_BOUNDARY.split(text):
        sentence = sentence.strip()
        if not sentence:
            continue
        if len(sentence) <= MAX_CHUNK_CHARS:
            chunks.append(sentence)
        else:
            chunks.extend(_wrap_long_sentence(sentence))
    return chunks


def tts_clone(text, ref_audio, lang_label, speed, split_sentences,
              progress=gr.Progress(track_tqdm=True)):
    """Synthesize *text* in the voice of *ref_audio* and return two WAV paths.

    Args:
        text: Text to speak; whitespace is normalized before use.
        ref_audio: File path of the reference recording, or ``None``.
        lang_label: Display name from ``LANG_LABELS``; unknown labels fall
            back to English.
        speed: Speed value forwarded to the model when it is supported.
        split_sentences: Synthesize sentence by sentence and concatenate.
        progress: Gradio progress tracker.

    Returns:
        ``(preview_path, download_path)``. The download path is a copy with a
        descriptive file name; if that copy cannot be made, both entries are
        the preview path.

    Raises:
        gr.Error: Missing reference audio, empty text, or text longer than
            ``MAX_UNSPLIT_CHARS`` with splitting disabled.
    """
    if ref_audio is None:
        raise gr.Error("Upload a reference voice (10–60s, clean speech).")
    text = clean_text(text)
    if not text:
        raise gr.Error("Please enter some text.")
    if len(text) > MAX_UNSPLIT_CHARS and not split_sentences:
        raise gr.Error("Text is very long. Enable 'Auto split' or paste a shorter chunk on CPU.")

    lang = LANG_MAP.get(lang_label, "en")
    chunks = _split_into_chunks(text) if split_sentences else [text]

    tts = get_tts()
    parts = []
    sample_rate = None
    # Per-chunk files are scratch data and are removed with the directory.
    with tempfile.TemporaryDirectory() as td:
        total = max(len(chunks), 1)
        for i, chunk in enumerate(chunks, 1):
            progress((i - 1) / total, desc=f"Synthesizing {i}/{total}")
            part_path = os.path.join(td, f"part_{i}.wav")
            synth_to_file_safe(tts, chunk, part_path, ref_audio, lang, speed)
            data, sr = sf.read(part_path)
            if sample_rate is None:
                # The first chunk's rate is used for the combined file.
                sample_rate = sr
            parts.append(data)

    final_data = parts[0] if len(parts) == 1 else np.concatenate(parts, axis=0)

    # The preview file must outlive this call, hence delete=False.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as ntf:
        preview_path = ntf.name
    sf.write(preview_path, final_data, sample_rate)

    # Copy under a descriptive name for the download button. A fresh directory
    # per request keeps two requests made in the same second with the same
    # text from overwriting each other's file.
    try:
        out_dir = tempfile.mkdtemp(prefix="talkclone_")
        dl_path = os.path.join(out_dir, safe_filename(text, lang))
        shutil.copyfile(preview_path, dl_path)
    except OSError:
        dl_path = preview_path  # fallback: offer the preview file itself

    return preview_path, dl_path


CUSTOM_CSS = """
.gradio-container { max-width: 860px !important; margin: 0 auto; }
#wrap, #ref, #lang, #txt, #spd, #split, #out_audio, #dl {
  background: #f8fafc !important;
  border: 1px solid #e5e7eb !important;
  border-radius: 14px !important;
  padding: 14px !important;
}
#ref, #out_audio, #dl { background: #eef2ff !important; }
#gen button, #gen { background: #10b981 !important; color: #fff !important; }
#gen button:hover { filter: brightness(0.95); }
/* hide HF/Gradio chrome */
footer, .footer, #footer,
a[href*="gradio.live"], a[href*="gradio.app"], a[href*="/api"], a[href*="hf.space"],
button[aria-label="Settings"],
[data-testid="block-analytics"], [data-testid="embed-info"] {
  display: none !important;
}
"""

with gr.Blocks(title="TalkClone - Voice Cloning & TTS", css=CUSTOM_CSS, analytics_enabled=False) as demo:
    with gr.Column(elem_id="wrap"):
        gr.Markdown("## TalkClone — Text-to-Speech with Voice Cloning")
        gr.Markdown(
            "Upload a short **reference voice** (10–60s), choose **language**, enter **text**, then **Generate**. "
            "On free CPU, keep text short or enable **Auto split** for speed."
        )
        ref_audio = gr.Audio(label="Reference Voice (WAV/MP3)", type="filepath", elem_id="ref")
        language = gr.Dropdown(choices=LANG_LABELS, value="English", label="Language", elem_id="lang")
        text = gr.Textbox(label="Text", lines=6, placeholder="Type or paste your text here…", elem_id="txt")
        speed = gr.Slider(0.7, 1.3, value=1.0, step=0.05, label="Speed", elem_id="spd")
        split = gr.Checkbox(value=True, label="Auto split long text by sentence", elem_id="split")
        submit = gr.Button("Generate", variant="primary", elem_id="gen")
        output = gr.Audio(label="Cloned Speech", type="filepath", interactive=False, elem_id="out_audio")
        download = gr.DownloadButton(label="Download audio", elem_id="dl")

    def run_and_return(text, ref_audio, language, speed, split,
                       progress=gr.Progress(track_tqdm=True)):
        """Click handler: synthesize, then fill the player and download button.

        The ``progress`` parameter is declared on this function because it is
        the one registered with Gradio; it is forwarded to ``tts_clone`` so
        the per-chunk updates are attached to this event.
        """
        audio_path, dl_path = tts_clone(text, ref_audio, language, speed, split, progress=progress)
        # Point the button at the file we just wrote and show its name.
        return audio_path, gr.update(value=dl_path, label=f"Download ({os.path.basename(dl_path)})")

    submit.click(
        run_and_return,
        inputs=[text, ref_audio, language, speed, split],
        outputs=[output, download],
    )

if __name__ == "__main__":
    port = int(os.environ.get("PORT", "7860"))
    launch_kwargs = {"server_name": "0.0.0.0", "server_port": port, "show_error": True}
    try:
        demo.queue().launch(show_api=False, **launch_kwargs)
    except TypeError:
        # Retry without ``show_api``: repeating the identical keywords could
        # never recover from an unsupported-keyword TypeError.
        demo.launch(**launch_kwargs)