File size: 6,743 Bytes
50f1b46 bf4353b 50f1b46 681b58a a4b0424 bf4353b 681b58a 8d9fcd0 9aaaf3c 8d9fcd0 a4b0424 8d9fcd0 a4b0424 681b58a a4b0424 681b58a a4b0424 681b58a 1c8e78d 681b58a 1c8e78d bf4353b 50f1b46 8d9fcd0 bf4353b 8d9fcd0 681b58a 8d9fcd0 681b58a 8d9fcd0 bf4353b 50f1b46 8d9fcd0 2c102d1 a4b0424 bf4353b 681b58a 8d9fcd0 681b58a 8d9fcd0 a4b0424 8d9fcd0 a4b0424 681b58a 50f1b46 9aaaf3c bf4353b 50f1b46 9aaaf3c 50f1b46 681b58a 8d9fcd0 50f1b46 8d9fcd0 50f1b46 8d9fcd0 50f1b46 8d9fcd0 2c102d1 50f1b46 8d9fcd0 9aaaf3c 8d9fcd0 50f1b46 8d9fcd0 50f1b46 8d9fcd0 50f1b46 8d9fcd0 681b58a a4b0424 1c8e78d 8d9fcd0 1c8e78d 8d9fcd0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 |
# app.py — TalkClone (HF Space, 1-column, persistent output, DownloadButton)
import os, re, tempfile, shutil, time
import numpy as np
import soundfile as sf
import gradio as gr
# Pre-set the Coqui terms-of-service flag unless the environment already defines it.
os.environ.setdefault("COQUI_TOS_AGREED", "1")

# Model identifier handed to the TTS constructor in get_tts().
MODEL_NAME = "tts_models/multilingual/multi-dataset/xtts_v2"

# (display label, language code) pairs, in the order shown in the dropdown.
LANGS = [
    ("English", "en"),
    ("Spanish", "es"),
    ("French", "fr"),
    ("German", "de"),
    ("Italian", "it"),
    ("Portuguese", "pt"),
    ("Polish", "pl"),
    ("Turkish", "tr"),
    ("Russian", "ru"),
    ("Dutch", "nl"),
    ("Czech", "cs"),
    ("Arabic", "ar"),
    ("Chinese (Simplified)", "zh-cn"),
    ("Hungarian", "hu"),
    ("Korean", "ko"),
    ("Japanese", "ja"),
    ("Hindi", "hi"),
]

# Dropdown choices and the label -> code lookup, both derived from LANGS.
LANG_LABELS = [label for label, _code in LANGS]
LANG_MAP = dict(LANGS)

# Lazily created model instance; populated by get_tts() on first use.
_tts = None
def get_tts():
    """Return the shared TTS model, loading it on the first call.

    The instance is cached in the module-level ``_tts`` so later calls
    return immediately. torch and TTS are imported lazily here so the
    module can be imported without paying the model start-up cost.

    Returns:
        The cached ``TTS`` instance for ``MODEL_NAME``.
    """
    global _tts
    if _tts is not None:
        return _tts

    try:
        import torch
        try:
            # Best-effort: cap intra-op threads at 4 (at least 1).
            torch.set_num_threads(max(1, min(4, os.cpu_count() or 2)))
        except Exception:
            # Thread tuning is optional; never block model loading on it.
            pass
        use_gpu = torch.cuda.is_available()
    except Exception:
        # torch missing or CUDA probing failed: run on CPU.
        use_gpu = False

    from TTS.api import TTS
    try:
        model = TTS(MODEL_NAME, gpu=use_gpu)
    except TypeError:
        # Constructor rejected the ``gpu`` keyword; build without it and
        # place the model explicitly so an available GPU is not ignored.
        model = TTS(MODEL_NAME)
        if use_gpu and hasattr(model, "to"):
            model.to("cuda")

    _tts = model
    return _tts
def clean_text(t: str) -> str:
    """Collapse every run of whitespace in *t* to a single space and trim the ends.

    Falsy input (``None``, ``""``) yields an empty string.
    """
    # str.split() with no argument already drops leading/trailing whitespace.
    return " ".join((t or "").split())
def synth_to_file_safe(tts, txt, out_path, wav_path, lang, speed):
    """Synthesize *txt* to *out_path*, passing ``speed`` only if it is accepted.

    Args:
        tts: Object exposing ``tts_to_file(text=..., file_path=..., speaker_wav=..., language=...)``.
        txt: Text to synthesize.
        out_path: Destination audio file path.
        wav_path: Reference speaker audio path.
        lang: Language code.
        speed: Speed value; dropped on retry if the first call raises ``TypeError``.
    """
    common = dict(text=txt, file_path=out_path, speaker_wav=wav_path, language=lang)
    try:
        tts.tts_to_file(**common, speed=speed)
    except TypeError:
        # Retry without ``speed`` for backends that reject that keyword.
        tts.tts_to_file(**common)
def safe_filename(seed_text: str, lang_code: str) -> str:
    """Build a download filename of the form ``<stem>_<lang_code>_<timestamp>.wav``.

    The stem is the first 40 characters of the whitespace-normalised
    *seed_text*, with every run of characters outside ``[A-Za-z0-9_-]``
    replaced by ``_`` and surrounding underscores removed. If nothing usable
    remains (empty, punctuation-only or non-ASCII text), the stem is
    ``"talkclone"``.

    Args:
        seed_text: Text the filename is derived from.
        lang_code: Language code embedded in the name.

    Returns:
        The filename (no directory component), timestamped to the second.
    """
    # Same normalisation as clean_text(): collapse whitespace, trim the ends.
    snippet = " ".join((seed_text or "").split())[:40]
    stem = re.sub(r"[^A-Za-z0-9_-]+", "_", snippet).strip("_")
    # Apply the fallback AFTER sanitising: non-ASCII text is non-empty going
    # in but can sanitise down to nothing.
    if not stem:
        stem = "talkclone"
    ts = time.strftime("%Y%m%d-%H%M%S")
    return f"{stem}_{lang_code}_{ts}.wav"
def tts_clone(text, ref_audio, lang_label, speed, split_sentences, progress=gr.Progress(track_tqdm=True)):
    """Synthesize *text* in the voice of *ref_audio* and write the result to disk.

    Args:
        text: Text to speak; whitespace is normalised with ``clean_text``.
        ref_audio: File path of the reference voice recording, or ``None``.
        lang_label: Display label looked up in ``LANG_MAP``; unknown labels
            fall back to ``"en"``.
        speed: Speed value forwarded to ``synth_to_file_safe``.
        split_sentences: When true, synthesize sentence by sentence and
            concatenate the audio.
        progress: Gradio progress callback, updated once per chunk.

    Returns:
        ``(audio_path, download_path)``: the temp WAV used for the preview and
        a descriptively named copy for the download button. The second equals
        the first if the copy could not be created.

    Raises:
        gr.Error: If no reference audio was given, the text is empty, or the
            text exceeds 1400 characters while splitting is disabled.
    """
    import textwrap  # only needed here; kept local to leave file-level imports untouched

    if ref_audio is None:
        raise gr.Error("Upload a reference voice (10–60s, clean speech).")
    text = clean_text(text)
    if not text:
        raise gr.Error("Please enter some text.")
    if len(text) > 1400 and not split_sentences:
        raise gr.Error("Text is very long. Enable 'Auto split' or paste a shorter chunk on CPU.")

    lang = LANG_MAP.get(lang_label, "en")
    wav_path = ref_audio

    chunks = [text]
    if split_sentences:
        # Split after sentence-ending punctuation that is followed by whitespace.
        sentences = [s.strip() for s in re.split(r'(?<=[.!?؟۔]|[\u0964\u0965])\s+', text) if s.strip()]
        chunks = []
        for sentence in sentences:
            if len(sentence) <= 220:
                chunks.append(sentence)
            else:
                # Break over-long sentences at word boundaries (<= 200 chars each)
                # so no word is cut in half; tokens longer than 200 chars (e.g.
                # unspaced text) are still hard-split.
                chunks.extend(
                    textwrap.wrap(sentence, width=200, break_long_words=True, break_on_hyphens=False)
                )

    tts = get_tts()
    parts = []
    with tempfile.TemporaryDirectory() as workdir:
        total = max(len(chunks), 1)
        for idx, chunk in enumerate(chunks, 1):
            progress((idx - 1) / total, desc=f"Synthesizing {idx}/{total}")
            part_path = os.path.join(workdir, f"part_{idx}.wav")
            synth_to_file_safe(tts, chunk, part_path, wav_path, lang, speed)
            # Read each part into memory; the work directory is deleted on exit.
            data, sr = sf.read(part_path)
            parts.append((data, sr))

    # Join the parts, using the first part's sample rate for the output.
    sr = parts[0][1]
    if len(parts) == 1:
        final_data = parts[0][0]
    else:
        final_data = np.concatenate([d for d, _ in parts], axis=0)

    # The preview file must outlive this call, hence delete=False.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as ntf:
        ntf_path = ntf.name
    sf.write(ntf_path, final_data, sr)

    # Descriptively named copy for the download button, in the system temp dir.
    pretty_name = os.path.join(tempfile.gettempdir(), safe_filename(text, lang))
    try:
        shutil.copyfile(ntf_path, pretty_name)
    except OSError:
        dl_path = ntf_path  # fallback: offer the anonymous temp file instead
    else:
        dl_path = pretty_name

    return ntf_path, dl_path
# Stylesheet passed to gr.Blocks(css=...): caps the container at 860px, gives
# the elements addressed by elem_id a card look, colours the Generate button
# green, and hides footer/API links and the settings button.
CUSTOM_CSS = """
.gradio-container { max-width: 860px !important; margin: 0 auto; }
#wrap, #ref, #lang, #txt, #spd, #split, #out_audio, #dl {
background: #f8fafc !important;
border: 1px solid #e5e7eb !important;
border-radius: 14px !important;
padding: 14px !important;
}
#ref, #out_audio, #dl { background: #eef2ff !important; }
#gen button, #gen { background: #10b981 !important; color: #fff !important; }
#gen button:hover { filter: brightness(0.95); }
/* hide HF/Gradio chrome */
footer, .footer, #footer,
a[href*="gradio.live"], a[href*="gradio.app"], a[href*="/api"], a[href*="hf.space"],
button[aria-label="Settings"],
[data-testid="block-analytics"], [data-testid="embed-info"] { display: none !important; }
"""
# Single-column UI: inputs, Generate button, audio preview, download button.
with gr.Blocks(title="TalkClone - Voice Cloning & TTS", css=CUSTOM_CSS, analytics_enabled=False) as demo:
    with gr.Column(elem_id="wrap"):
        gr.Markdown("## TalkClone — Text-to-Speech with Voice Cloning")
        gr.Markdown("Upload a short **reference voice** (10–60s), choose **language**, enter **text**, then **Generate**. "
                    "On free CPU, keep text short or enable **Auto split** for speed.")
        ref_audio = gr.Audio(label="Reference Voice (WAV/MP3)", type="filepath", elem_id="ref")
        language = gr.Dropdown(choices=LANG_LABELS, value="English", label="Language", elem_id="lang")
        text = gr.Textbox(label="Text", lines=6, placeholder="Type or paste your text here…", elem_id="txt")
        speed = gr.Slider(0.7, 1.3, value=1.0, step=0.05, label="Speed", elem_id="spd")
        split = gr.Checkbox(value=True, label="Auto split long text by sentence", elem_id="split")
        submit = gr.Button("Generate", variant="primary", elem_id="gen")
        output = gr.Audio(label="Cloned Speech", type="filepath", interactive=False, elem_id="out_audio")
        download = gr.DownloadButton(label="Download audio", elem_id="dl")

    def run_and_return(text, ref_audio, language, speed, split, progress=gr.Progress(track_tqdm=True)):
        """Click handler: synthesize, then update the preview and download button.

        ``progress`` is declared on the handler itself because Gradio only
        injects a tracked progress object into the function it calls
        directly; it is then forwarded to ``tts_clone``.
        """
        audio_path, dl_path = tts_clone(text, ref_audio, language, speed, split, progress=progress)
        # Point the button at the file just written and show its name.
        return audio_path, gr.update(value=dl_path, label=f"Download ({os.path.basename(dl_path)})")

    submit.click(run_and_return,
                 inputs=[text, ref_audio, language, speed, split],
                 outputs=[output, download])
if __name__ == "__main__":
    # Listen on all interfaces; the port comes from $PORT (default 7860).
    launch_kwargs = dict(
        server_name="0.0.0.0",
        server_port=int(os.environ.get("PORT", "7860")),
        show_error=True,
        show_api=False,
    )
    try:
        demo.queue().launch(**launch_kwargs)
    except TypeError:
        # NOTE(review): presumably a compatibility fallback for Gradio versions
        # whose queue()/launch() signatures differ; confirm it is still needed.
        demo.launch(**launch_kwargs)
|