Spaces:

asbgig
/

talkclone

Running

File size: 6,743 Bytes

50f1b46
bf4353b
50f1b46
681b58a
 
a4b0424
bf4353b
 
 
681b58a
 
8d9fcd0
9aaaf3c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8d9fcd0
 
 
 
a4b0424
 
 
 
 
 
 
8d9fcd0
 
 
 
a4b0424
 
 
 
 
 
 
 
 
681b58a
a4b0424
 
681b58a
a4b0424
681b58a
1c8e78d
 
681b58a
1c8e78d
 
bf4353b
50f1b46
 
 
 
 
 
8d9fcd0
bf4353b
8d9fcd0
681b58a
 
 
8d9fcd0
 
 
 
681b58a
8d9fcd0
bf4353b
 
50f1b46
8d9fcd0
 
 
 
 
 
 
2c102d1
a4b0424
bf4353b
681b58a
8d9fcd0
681b58a
8d9fcd0
a4b0424
8d9fcd0
a4b0424
681b58a
 
50f1b46
9aaaf3c
 
 
 
 
bf4353b
50f1b46
 
 
 
 
9aaaf3c
50f1b46
 
 
 
 
 
 
 
 
681b58a
8d9fcd0
 
 
50f1b46
 
8d9fcd0
 
 
50f1b46
8d9fcd0
 
50f1b46
8d9fcd0
 
 
 
2c102d1
 
50f1b46
8d9fcd0
 
 
9aaaf3c
8d9fcd0
 
50f1b46
 
 
 
 
8d9fcd0
50f1b46
 
8d9fcd0
 
50f1b46
 
 
8d9fcd0
 
 
 
681b58a
 
a4b0424
1c8e78d
8d9fcd0
1c8e78d
8d9fcd0

# app.py — TalkClone (HF Space, 1-column, persistent output, DownloadButton)

import os, re, tempfile, shutil, time
import numpy as np
import soundfile as sf
import gradio as gr

# Pre-set COQUI_TOS_AGREED to "1" unless the environment already defines it.
# NOTE(review): presumably this lets the Coqui model download proceed without
# an interactive licence prompt — confirm against the TTS package docs.
os.environ.setdefault("COQUI_TOS_AGREED", "1")

# Model identifier handed to the TTS constructor.
MODEL_NAME = "tts_models/multilingual/multi-dataset/xtts_v2"

# (display label, language code) pairs, in the order shown in the dropdown.
LANGS = [
    ("English", "en"),
    ("Spanish", "es"),
    ("French", "fr"),
    ("German", "de"),
    ("Italian", "it"),
    ("Portuguese", "pt"),
    ("Polish", "pl"),
    ("Turkish", "tr"),
    ("Russian", "ru"),
    ("Dutch", "nl"),
    ("Czech", "cs"),
    ("Arabic", "ar"),
    ("Chinese (Simplified)", "zh-cn"),
    ("Hungarian", "hu"),
    ("Korean", "ko"),
    ("Japanese", "ja"),
    ("Hindi", "hi"),
]

# Labels only (dropdown choices) and label -> code lookup, both derived from LANGS.
LANG_LABELS = [label for label, _code in LANGS]
LANG_MAP = dict(LANGS)

# Process-wide cache for the loaded model; filled on the first get_tts() call.
_tts = None


def get_tts():
    """Return the shared TTS model instance, constructing it on first use.

    The first call imports torch (best effort) to cap the CPU thread count
    and detect CUDA, then builds ``TTS(MODEL_NAME, gpu=...)``. If that
    constructor rejects the ``gpu`` keyword with a TypeError, the model is
    built with the name alone. Later calls return the cached instance.
    """
    global _tts
    if _tts is None:
        try:
            import torch

            try:
                # Cap torch at 4 threads; assume 2 cores when the count is unknown.
                core_count = os.cpu_count() or 2
                torch.set_num_threads(max(1, min(4, core_count)))
            except Exception:
                # Thread tuning is optional; carry on with torch's defaults.
                pass
            cuda_ok = torch.cuda.is_available()
        except Exception:
            # torch missing or unusable: run on CPU.
            cuda_ok = False

        from TTS.api import TTS

        try:
            _tts = TTS(MODEL_NAME, gpu=cuda_ok)
        except TypeError:
            # Constructor signature without a `gpu` keyword.
            _tts = TTS(MODEL_NAME)
    return _tts

def clean_text(t: str) -> str:
    """Collapse every whitespace run in *t* to one space and trim both ends.

    Falsy input (``None`` or an empty string) yields ``""``.
    """
    if not t:
        return ""
    # str.split() with no separator already ignores leading/trailing whitespace.
    words = t.split()
    return " ".join(words)

def synth_to_file_safe(tts, txt, out_path, wav_path, lang, speed):
    """Synthesize *txt* to *out_path* via ``tts.tts_to_file``.

    The call is first made with the ``speed`` keyword; if that raises a
    TypeError the call is repeated without ``speed``.
    """
    base_kwargs = {
        "text": txt,
        "file_path": out_path,
        "speaker_wav": wav_path,
        "language": lang,
    }
    try:
        tts.tts_to_file(**base_kwargs, speed=speed)
    except TypeError:
        # Retry without the speed argument.
        tts.tts_to_file(**base_kwargs)

def safe_filename(seed_text: str, lang_code: str) -> str:
    """Build a timestamped ``.wav`` file name from the start of *seed_text*.

    The first 40 characters of the whitespace-normalised text are reduced to
    ASCII letters, digits, ``_`` and ``-`` (every other run becomes a single
    ``_``; leading/trailing ``_`` are trimmed).

    Args:
        seed_text: Text the name is derived from; may be empty or ``None``.
        lang_code: Language code inserted after the stem.

    Returns:
        ``"<stem>_<lang_code>_<YYYYMMDD-HHMMSS>.wav"`` using local time.
    """
    stem = clean_text(seed_text)[:40]
    stem = re.sub(r"[^A-Za-z0-9_-]+", "_", stem).strip("_")
    # Apply the fallback AFTER sanitising: text with no ASCII alphanumerics
    # (e.g. Chinese, Arabic, Hindi) sanitises to an empty stem, which would
    # otherwise give a name starting with a bare underscore.
    if not stem:
        stem = "talkclone"
    ts = time.strftime("%Y%m%d-%H%M%S")
    return f"{stem}_{lang_code}_{ts}.wav"

def _split_into_chunks(text, soft_limit=220, hard_limit=200):
    """Split cleaned *text* into sentence-sized pieces for synthesis.

    Sentences are cut after ``. ! ? ؟ ۔ । ॥`` followed by whitespace. A
    sentence longer than *soft_limit* characters is wrapped into pieces of at
    most *hard_limit* characters, breaking at spaces so words stay intact;
    a single run without spaces that exceeds *hard_limit* is still cut hard.
    """
    import textwrap  # local import: only this helper needs it

    sentences = [s.strip() for s in re.split(r'(?<=[.!?؟۔]|[\u0964\u0965])\s+', text) if s.strip()]
    chunks = []
    for sentence in sentences:
        if len(sentence) <= soft_limit:
            chunks.append(sentence)
        else:
            chunks.extend(
                textwrap.wrap(
                    sentence,
                    width=hard_limit,
                    break_long_words=True,
                    break_on_hyphens=False,
                )
            )
    return chunks


def tts_clone(text, ref_audio, lang_label, speed, split_sentences, progress=gr.Progress(track_tqdm=True)):
    """Synthesize *text* in the voice of *ref_audio* and write it to WAV files.

    Args:
        text: Text to speak; whitespace is normalised before use.
        ref_audio: File path of the reference voice recording, or ``None``.
        lang_label: Display label looked up in ``LANG_MAP``; unknown labels
            fall back to ``"en"``.
        speed: Speed value forwarded to the synthesizer.
        split_sentences: When true, the text is synthesized chunk by chunk.
        progress: Gradio progress tracker, called once per chunk.

    Returns:
        ``(preview_path, download_path)``: the temporary WAV file and a copy
        of it under a readable file name in the temp directory (the same
        path twice if the copy could not be made).

    Raises:
        gr.Error: If no reference audio is given, the text is empty, or the
            text exceeds 1400 characters while splitting is disabled.
    """
    if ref_audio is None:
        raise gr.Error("Upload a reference voice (10–60s, clean speech).")
    text = clean_text(text)
    if not text:
        raise gr.Error("Please enter some text.")
    if len(text) > 1400 and not split_sentences:
        raise gr.Error("Text is very long. Enable 'Auto split' or paste a shorter chunk on CPU.")

    lang = LANG_MAP.get(lang_label, "en")
    wav_path = ref_audio

    chunks = [text]
    if split_sentences:
        # Fall back to the whole text should splitting ever yield nothing.
        chunks = _split_into_chunks(text) or [text]

    tts = get_tts()
    out_wavs = []
    # Per-chunk files are scratch data; they vanish with the directory.
    with tempfile.TemporaryDirectory() as td:
        total = max(len(chunks), 1)
        for i, chunk in enumerate(chunks, 1):
            progress((i - 1) / total, desc=f"Synthesizing {i}/{total}")
            part_path = os.path.join(td, f"part_{i}.wav")
            synth_to_file_safe(tts, chunk, part_path, wav_path, lang, speed)
            data, sr = sf.read(part_path)
            out_wavs.append((data, sr))

    # Join the parts; the sample rate of the first part is used for the result.
    if len(out_wavs) == 1:
        final_data, sr = out_wavs[0]
    else:
        sr = out_wavs[0][1]
        final_data = np.concatenate([d for d, _ in out_wavs], axis=0)

    # delete=False keeps the file after this function returns so the UI can
    # still read it.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as ntf:
        ntf_path = ntf.name
    sf.write(ntf_path, final_data, sr)

    # Copy to a readable name next to the temp file (same temp directory
    # instead of a hard-coded "/tmp").
    pretty_name = os.path.join(tempfile.gettempdir(), safe_filename(text, lang))
    try:
        shutil.copyfile(ntf_path, pretty_name)
        dl_path = pretty_name
    except OSError:
        dl_path = ntf_path  # fallback: offer the temp file itself

    # Audio preview path, and the file path for the DownloadButton.
    return ntf_path, dl_path

# Stylesheet passed to gr.Blocks(css=...). The #id selectors match the
# elem_id values given to the components in the layout (wrap, ref, lang, txt,
# spd, split, out_audio, dl, gen); the last rule hides footer/API/settings
# elements of the hosting page.
CUSTOM_CSS = """
.gradio-container { max-width: 860px !important; margin: 0 auto; }
#wrap, #ref, #lang, #txt, #spd, #split, #out_audio, #dl {
  background: #f8fafc !important;
  border: 1px solid #e5e7eb !important;
  border-radius: 14px !important;
  padding: 14px !important;
}
#ref, #out_audio, #dl { background: #eef2ff !important; }
#gen button, #gen { background: #10b981 !important; color: #fff !important; }
#gen button:hover { filter: brightness(0.95); }
/* hide HF/Gradio chrome */
footer, .footer, #footer,
a[href*="gradio.live"], a[href*="gradio.app"], a[href*="/api"], a[href*="hf.space"],
button[aria-label="Settings"],
[data-testid="block-analytics"], [data-testid="embed-info"] { display: none !important; }
"""

# Single-column UI: inputs on top, Generate button, then the audio preview
# and a download button that is re-pointed at each newly generated file.
with gr.Blocks(title="TalkClone - Voice Cloning & TTS", css=CUSTOM_CSS, analytics_enabled=False) as demo:
    with gr.Column(elem_id="wrap"):
        gr.Markdown("## TalkClone — Text-to-Speech with Voice Cloning")
        gr.Markdown("Upload a short **reference voice** (10–60s), choose **language**, enter **text**, then **Generate**. "
                    "On free CPU, keep text short or enable **Auto split** for speed.")

        ref_audio = gr.Audio(label="Reference Voice (WAV/MP3)", type="filepath", elem_id="ref")
        language  = gr.Dropdown(choices=LANG_LABELS, value="English", label="Language", elem_id="lang")
        text      = gr.Textbox(label="Text", lines=6, placeholder="Type or paste your text here…", elem_id="txt")
        speed     = gr.Slider(0.7, 1.3, value=1.0, step=0.05, label="Speed", elem_id="spd")
        split     = gr.Checkbox(value=True, label="Auto split long text by sentence", elem_id="split")
        submit    = gr.Button("Generate", variant="primary", elem_id="gen")

        output   = gr.Audio(label="Cloned Speech", type="filepath", interactive=False, elem_id="out_audio")
        download = gr.DownloadButton(label="Download audio", elem_id="dl")

        def run_and_return(text, ref_audio, language, speed, split,
                           progress=gr.Progress(track_tqdm=True)):
            """Click handler: synthesize, then update the preview and download button.

            The ``progress`` default must live on this function, because this
            is the callable registered with ``submit.click``: Gradio attaches
            a tracker only to parameters of the registered handler. It is
            forwarded to ``tts_clone`` so per-chunk updates reach the UI.
            """
            audio_path, dl_path = tts_clone(text, ref_audio, language, speed, split,
                                            progress=progress)
            # Point the button at the file we just wrote and show its name.
            return audio_path, gr.update(value=dl_path, label=f"Download ({os.path.basename(dl_path)})")

        submit.click(run_and_return,
                     inputs=[text, ref_audio, language, speed, split],
                     outputs=[output, download])

if __name__ == "__main__":
    # Listen on all interfaces; the port comes from $PORT, defaulting to 7860.
    launch_kwargs = dict(
        server_name="0.0.0.0",
        server_port=int(os.environ.get("PORT", "7860")),
        show_error=True,
        show_api=False,
    )
    try:
        demo.queue().launch(**launch_kwargs)
    except TypeError:
        # Retry without going through queue() if the first call raised TypeError.
        demo.launch(**launch_kwargs)