asbgig committed on
Commit
9aaaf3c
·
verified ·
1 Parent(s): 256aa14

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +43 -22
app.py CHANGED
@@ -1,6 +1,6 @@
1
- # app.py — TalkClone (HF Space, 1-column, custom styles, CPU-friendly)
2
 
3
- import os, re, tempfile
4
  import numpy as np
5
  import soundfile as sf
6
  import gradio as gr
@@ -10,11 +10,25 @@ os.environ.setdefault("COQUI_TOS_AGREED", "1")
10
 
11
  MODEL_NAME = "tts_models/multilingual/multi-dataset/xtts_v2"
12
 
13
- # Show labels, send codes
14
  LANGS = [
15
- ("English","en"), ("Urdu","ur"), ("Hindi","hi"), ("Arabic","ar"),
16
- ("French","fr"), ("German","de"), ("Spanish","es"), ("Italian","it"),
17
- ("Portuguese","pt"), ("Turkish","tr"),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  ]
19
  LANG_LABELS = [name for name, _ in LANGS]
20
  LANG_MAP = {name: code for name, code in LANGS}
@@ -27,7 +41,6 @@ def get_tts():
27
  return _tts
28
  try:
29
  import torch
30
- # Use all available CPU threads on Basic (usually 2 vCPU)
31
  try:
32
  torch.set_num_threads(max(1, min(4, os.cpu_count() or 2)))
33
  except Exception:
@@ -35,6 +48,7 @@ def get_tts():
35
  use_gpu = torch.cuda.is_available()
36
  except Exception:
37
  use_gpu = False
 
38
  from TTS.api import TTS
39
  try:
40
  _tts = TTS(MODEL_NAME, gpu=use_gpu)
@@ -60,28 +74,29 @@ def tts_clone(text, ref_audio, lang_label, speed, split_sentences, progress=gr.P
60
  if not text:
61
  raise gr.Error("Please enter some text.")
62
 
63
- # Limit extremely long jobs on CPU Basic
64
  if len(text) > 1400 and not split_sentences:
65
  raise gr.Error("Text is very long. Enable 'Auto split' or paste a shorter chunk on CPU.")
66
 
67
  lang = LANG_MAP.get(lang_label, "en")
68
  wav_path = ref_audio
69
 
70
- # Sentence split + also break very long sentences into ~180–220 chars
71
  chunks = [text]
72
  if split_sentences:
73
- rough = [s.strip() for s in re.split(r'(?<=[.!?؟۔])\s+', text) if s.strip()]
74
  chunks = []
75
  for s in rough:
76
  if len(s) <= 220:
77
  chunks.append(s)
78
  else:
79
- # soft wrap long lines
80
  for i in range(0, len(s), 200):
81
  chunks.append(s[i:i+200])
82
 
83
  tts = get_tts()
84
  out_wavs = []
 
 
85
  with tempfile.TemporaryDirectory() as td:
86
  total = max(len(chunks), 1)
87
  for i, chunk in enumerate(chunks, 1):
@@ -91,16 +106,19 @@ def tts_clone(text, ref_audio, lang_label, speed, split_sentences, progress=gr.P
91
  data, sr = sf.read(part_path)
92
  out_wavs.append((data, sr))
93
 
94
- # Concatenate
95
- if len(out_wavs) == 1:
96
- final_data, sr = out_wavs[0]
97
- else:
98
- sr = out_wavs[0][1]
99
- final_data = np.concatenate([d for d, _ in out_wavs], axis=0)
100
 
101
- final_path = os.path.join(td, "output.wav")
102
- sf.write(final_path, final_data, sr)
103
- return final_path
 
 
 
104
 
105
  # ==== Styles (1 column + colors + hide HF/Gradio UI chrome) ====
106
  CUSTOM_CSS = """
@@ -113,6 +131,9 @@ CUSTOM_CSS = """
113
  padding: 14px !important;
114
  }
115
 
 
 
 
116
  /* Primary button color */
117
  #gen button, #gen { background: #10b981 !important; color: #fff !important; }
118
  #gen button:hover { filter: brightness(0.95); }
@@ -132,10 +153,10 @@ with gr.Blocks(
132
  with gr.Column(elem_id="wrap"):
133
  gr.Markdown("## TalkClone — Text-to-Speech with Voice Cloning")
134
  gr.Markdown("Upload a short **reference voice** (10–60s), choose **language**, enter **text**, then **Generate**. "
135
- "On CPU Basic, keep text short or enable **Auto split** for speed.")
136
 
137
  ref_audio = gr.Audio(label="Reference Voice (WAV/MP3)", type="filepath", elem_id="ref")
138
- language = gr.Dropdown(choices=LANG_LABELS, value="English", label="Language", elem_id="lang")
139
  text = gr.Textbox(label="Text", lines=6, placeholder="Type or paste your text here…", elem_id="txt")
140
  speed = gr.Slider(0.7, 1.3, value=1.0, step=0.05, label="Speed", elem_id="spd")
141
  split = gr.Checkbox(value=True, label="Auto split long text by sentence", elem_id="split")
 
1
+ # app.py — TalkClone (HF Space, 1-column, persistent output, CPU-friendly)
2
 
3
+ import os, re, tempfile, shutil
4
  import numpy as np
5
  import soundfile as sf
6
  import gradio as gr
 
10
 
11
  MODEL_NAME = "tts_models/multilingual/multi-dataset/xtts_v2"
12
 
13
+ # Show labels, send codes (XTTS v2 supported only)
14
  LANGS = [
15
+ ("English", "en"),
16
+ ("Spanish", "es"),
17
+ ("French", "fr"),
18
+ ("German", "de"),
19
+ ("Italian", "it"),
20
+ ("Portuguese", "pt"),
21
+ ("Polish", "pl"),
22
+ ("Turkish", "tr"),
23
+ ("Russian", "ru"),
24
+ ("Dutch", "nl"),
25
+ ("Czech", "cs"),
26
+ ("Arabic", "ar"),
27
+ ("Chinese (Simplified)", "zh-cn"),
28
+ ("Hungarian", "hu"),
29
+ ("Korean", "ko"),
30
+ ("Japanese","ja"),
31
+ ("Hindi", "hi"),
32
  ]
33
  LANG_LABELS = [name for name, _ in LANGS]
34
  LANG_MAP = {name: code for name, code in LANGS}
 
41
  return _tts
42
  try:
43
  import torch
 
44
  try:
45
  torch.set_num_threads(max(1, min(4, os.cpu_count() or 2)))
46
  except Exception:
 
48
  use_gpu = torch.cuda.is_available()
49
  except Exception:
50
  use_gpu = False
51
+
52
  from TTS.api import TTS
53
  try:
54
  _tts = TTS(MODEL_NAME, gpu=use_gpu)
 
74
  if not text:
75
  raise gr.Error("Please enter some text.")
76
 
77
+ # Limit extremely long jobs on free CPU
78
  if len(text) > 1400 and not split_sentences:
79
  raise gr.Error("Text is very long. Enable 'Auto split' or paste a shorter chunk on CPU.")
80
 
81
  lang = LANG_MAP.get(lang_label, "en")
82
  wav_path = ref_audio
83
 
84
+ # Sentence split + also break very long sentences into ~200 chars
85
  chunks = [text]
86
  if split_sentences:
87
+ rough = [s.strip() for s in re.split(r'(?<=[.!?؟。.。،،]|[\u0964\u0965])\s+', text) if s.strip()]
88
  chunks = []
89
  for s in rough:
90
  if len(s) <= 220:
91
  chunks.append(s)
92
  else:
 
93
  for i in range(0, len(s), 200):
94
  chunks.append(s[i:i+200])
95
 
96
  tts = get_tts()
97
  out_wavs = []
98
+
99
+ # Use a temp dir for parts, but write the FINAL file to a persistent temp path
100
  with tempfile.TemporaryDirectory() as td:
101
  total = max(len(chunks), 1)
102
  for i, chunk in enumerate(chunks, 1):
 
106
  data, sr = sf.read(part_path)
107
  out_wavs.append((data, sr))
108
 
109
+ # Concatenate and save to a persistent temp file that survives function return
110
+ if len(out_wavs) == 1:
111
+ final_data, sr = out_wavs[0]
112
+ else:
113
+ sr = out_wavs[0][1]
114
+ final_data = np.concatenate([d for d, _ in out_wavs], axis=0)
115
 
116
+ persistent_tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
117
+ persistent_tmp_path = persistent_tmp.name
118
+ persistent_tmp.close() # path remains; we write to it next
119
+ sf.write(persistent_tmp_path, final_data, sr)
120
+
121
+ return persistent_tmp_path
122
 
123
  # ==== Styles (1 column + colors + hide HF/Gradio UI chrome) ====
124
  CUSTOM_CSS = """
 
131
  padding: 14px !important;
132
  }
133
 
134
+ /* Make the component surfaces non-white */
135
+ #ref, #out_audio, #dl { background: #eef2ff !important; } /* indigo-50-ish */
136
+
137
  /* Primary button color */
138
  #gen button, #gen { background: #10b981 !important; color: #fff !important; }
139
  #gen button:hover { filter: brightness(0.95); }
 
153
  with gr.Column(elem_id="wrap"):
154
  gr.Markdown("## TalkClone — Text-to-Speech with Voice Cloning")
155
  gr.Markdown("Upload a short **reference voice** (10–60s), choose **language**, enter **text**, then **Generate**. "
156
+ "On free CPU, keep text short or enable **Auto split** for speed.")
157
 
158
  ref_audio = gr.Audio(label="Reference Voice (WAV/MP3)", type="filepath", elem_id="ref")
159
+ language = gr.Dropdown(choices=[name for name, _ in LANGS], value="English", label="Language", elem_id="lang")
160
  text = gr.Textbox(label="Text", lines=6, placeholder="Type or paste your text here…", elem_id="txt")
161
  speed = gr.Slider(0.7, 1.3, value=1.0, step=0.05, label="Speed", elem_id="spd")
162
  split = gr.Checkbox(value=True, label="Auto split long text by sentence", elem_id="split")