Spaces:

asbgig
/

talkclone

Running

App Files Files Community

asbgig committed on Aug 22, 2025

Commit

a4b0424

verified ·

1 Parent(s): bf4353b

Update app.py

Browse files

Files changed (1) hide show

app.py +56 -78

app.py CHANGED Viewed

@@ -1,32 +1,36 @@
-# app.py — TalkClone (HF Space, one-column, footer hidden)
-import os
-import tempfile
-import re
 import numpy as np
 import soundfile as sf
-# --- Coqui XTTS license prompt (must be set in headless envs like Spaces)
 os.environ.setdefault("COQUI_TOS_AGREED", "1")
-import gradio as gr
-from TTS.api import TTS
-# ----------------------------
-# Model: Coqui XTTS v2
-# ----------------------------
 MODEL_NAME = "tts_models/multilingual/multi-dataset/xtts_v2"
-# Try to use GPU when available (on HF, switch Space hardware to a GPU in Settings)
-try:
-    import torch
-    USE_GPU = torch.cuda.is_available()
-    tts = TTS(MODEL_NAME, gpu=USE_GPU)  # some versions accept gpu kwarg
-except Exception:
-    # Fallback (older/newer API variations)
-    tts = TTS(MODEL_NAME)
-# (label, value) pairs -> UI shows label, function receives code
 LANGS = [
     ("English", "en"),
     ("Urdu", "ur"),
@@ -40,103 +44,76 @@ LANGS = [
     ("Turkish", "tr"),
 ]
-def clean_text(text: str) -> str:
-    """Trim and collapse whitespace."""
-    return " ".join((text or "").strip().split())
-def synth_to_file_safe(txt, out_path, wav_path, lang, speed):
-    """
-    Call XTTS with 'speed' if supported; fall back without it if not.
-    Some XTTS builds ignore/raise on speed, so we guard it.
-    """
     try:
         tts.tts_to_file(
-            text=txt,
-            file_path=out_path,
-            speaker_wav=wav_path,
-            language=lang,
-            speed=speed,
         )
     except TypeError:
-        # Older/newer variants may not accept "speed"
         tts.tts_to_file(
-            text=txt,
-            file_path=out_path,
-            speaker_wav=wav_path,
-            language=lang,
         )
 def tts_clone(text, ref_audio, language_code, speed, split_sentences, progress=gr.Progress(track_tqdm=True)):
-    # Basic checks
     if ref_audio is None:
-        raise gr.Error("Please upload a reference voice sample (10–60 seconds of clean speech).")
     text = clean_text(text)
     if not text:
         raise gr.Error("Please enter some text.")
-    # Gradio passes a file path when type='filepath'
     wav_path = ref_audio
-    # Split long text into sentences (keeps memory lower on CPU; speeds up first output chunk)
     chunks = [text]
     if split_sentences:
         chunks = [s.strip() for s in re.split(r'(?<=[.!?؟۔])\s+', text) if s.strip()]
     out_wavs = []
     with tempfile.TemporaryDirectory() as td:
         for i, chunk in enumerate(chunks, 1):
-            progress((i - 1) / max(len(chunks), 1), desc=f"Synthesizing part {i}/{len(chunks)}")
-            out_path = os.path.join(td, f"part_{i}.wav")
-            synth_to_file_safe(chunk, out_path, wav_path, language_code, speed)
-            data, sr = sf.read(out_path)
             out_wavs.append((data, sr))
-        # Concatenate all parts
         if len(out_wavs) == 1:
             final_data, sr = out_wavs[0]
         else:
             sr = out_wavs[0][1]
             final_data = np.concatenate([d for d, _ in out_wavs], axis=0)
-        # Save final output
         final_path = os.path.join(td, "output.wav")
         sf.write(final_path, final_data, sr)
         return final_path
-# ---- Minimal CSS: one column layout + hide footer / badges / settings
 HIDE_CSS = """
-/* one-column width */
 .gradio-container { max-width: 880px !important; margin: 0 auto; }
-/* hide footer + badges + "Use via API" strip */
-footer, .footer, #footer { display: none !important; }
-a[href*="gradio.live"], a[href*="gradio.app"], a[href*="hf.space"] { display: none !important; }
-/* hide top-right settings gear / menu in many themes */
-button[aria-label="Settings"], [data-testid="block-analytics"], [data-testid="embed-info"] { display:none !important; }
 """
-THEME = gr.themes.Soft(
-    primary_hue="indigo",
-    neutral_hue="slate",
-).set(
-    body_background_fill="*white",
-    button_primary_background_fill="*primary_500",
-    button_primary_background_fill_hover="*primary_600",
-    input_background_fill="*neutral_50",
-    input_border_color="*neutral_200",
-)
 with gr.Blocks(
     title="TalkClone - Voice Cloning & TTS",
-    theme=THEME,
     css=HIDE_CSS,
     analytics_enabled=False
 ) as demo:
     gr.Markdown("## TalkClone — Turn Text into Speech from a Reference Voice")
     gr.Markdown(
-        "Upload a short, clean **reference voice** (10–60s), pick a **language**, paste your **text**, and click **Generate**.\n\n"
-        "**Tip for speed:** long paragraphs synthesize faster if you keep them under ~20–30 seconds per sentence. "
-        "For best cloning quality, avoid background music/noise."
     )
     ref_audio = gr.Audio(label="Reference Voice (WAV/MP3)", type="filepath")
@@ -144,14 +121,14 @@ with gr.Blocks(
     text = gr.Textbox(label="Text", lines=6, placeholder="Type or paste your text here…")
     speed = gr.Slider(0.7, 1.3, value=1.0, step=0.05, label="Speed")
     split = gr.Checkbox(value=True, label="Auto split long text by sentence")
-    submit = gr.Button("Generate", variant="primary", scale=1)
     output = gr.Audio(label="Cloned Speech", type="filepath", interactive=False)
     download = gr.File(label="Download audio")
     def run_and_return(text, ref_audio, language, speed, split):
-        out_path = tts_clone(text, ref_audio, language, speed, split)
-        return out_path, out_path
     submit.click(
         run_and_return,
@@ -160,10 +137,11 @@ with gr.Blocks(
     )
 if __name__ == "__main__":
-    # On HF Spaces, don't open browser; show_api=False hides "Use via API".
     demo.queue(concurrency_count=1).launch(
         server_name="0.0.0.0",
-        server_port=7860,
         show_error=True,
-        show_api=False
     )

+# app.py — TalkClone (HF Space, one-column, footer hidden, binds to $PORT)
+import os, re, tempfile
 import numpy as np
 import soundfile as sf
+import gradio as gr
+# Accept Coqui license non-interactively (required on Spaces)
 os.environ.setdefault("COQUI_TOS_AGREED", "1")
 MODEL_NAME = "tts_models/multilingual/multi-dataset/xtts_v2"
+# Lazy-load TTS so the Space starts quickly and fails less often
+_tts = None
+def get_tts():
     # Return the shared TTS instance for MODEL_NAME, building it on the first
     # call and caching it in the module-level `_tts` for every later call.
+    global _tts
+    if _tts is not None:
+        return _tts
+    # Try GPU if torch+CUDA is present; otherwise fall back to CPU.
+    try:
+        import torch
+        use_gpu = torch.cuda.is_available()
     # Any failure to import torch or to query CUDA is treated as "no GPU".
+    except Exception:
+        use_gpu = False
     # Imported inside the function, so the TTS package is only loaded on first use.
+    from TTS.api import TTS
+    try:
+        # Some versions accept gpu=…
+        _tts = TTS(MODEL_NAME, gpu=use_gpu)
     # If the constructor rejects the call with TypeError, retry without the
     # `gpu` keyword; `use_gpu` is not applied on this path.
+    except TypeError:
+        _tts = TTS(MODEL_NAME)
+    return _tts
 LANGS = [
     ("English", "en"),
     ("Urdu", "ur"),
     ("Turkish", "tr"),
 ]
+def clean_text(t: str) -> str:
     # Trim the ends and collapse every run of whitespace to a single space.
     # A falsy input (None or "") yields "".
+    return " ".join((t or "").strip().split())
+def synth_to_file_safe(tts, txt, out_path, wav_path, lang, speed):
     # Synthesize `txt` into the file `out_path` via `tts.tts_to_file`, passing
     # `wav_path` as the speaker reference and `lang` as the language.
     # The first attempt includes `speed`; on TypeError the call is repeated without it.
+    # XTTS variants differ on "speed" support
     try:
         tts.tts_to_file(
+            text=txt, file_path=out_path,
+            speaker_wav=wav_path, language=lang, speed=speed
         )
     # NOTE(review): this catches any TypeError raised by the first call, not
     # only an unsupported `speed` keyword, and the retry silently drops `speed`.
     except TypeError:
         tts.tts_to_file(
+            text=txt, file_path=out_path,
+            speaker_wav=wav_path, language=lang
         )
 def tts_clone(text, ref_audio, language_code, speed, split_sentences, progress=gr.Progress(track_tqdm=True)):
     if ref_audio is None:
+        raise gr.Error("Please upload a reference voice sample (10–60 seconds, clean speech).")
     text = clean_text(text)
     if not text:
         raise gr.Error("Please enter some text.")
     wav_path = ref_audio
     chunks = [text]
     if split_sentences:
+        # Split on sentence boundaries including Urdu/Arabic punctuation
         chunks = [s.strip() for s in re.split(r'(?<=[.!?؟۔])\s+', text) if s.strip()]
+    tts = get_tts()
     out_wavs = []
     with tempfile.TemporaryDirectory() as td:
         for i, chunk in enumerate(chunks, 1):
+            progress((i-1)/max(len(chunks),1), desc=f"Synthesizing {i}/{len(chunks)}")
+            part_path = os.path.join(td, f"part_{i}.wav")
+            synth_to_file_safe(tts, chunk, part_path, wav_path, language_code, speed)
+            data, sr = sf.read(part_path)
             out_wavs.append((data, sr))
         if len(out_wavs) == 1:
             final_data, sr = out_wavs[0]
         else:
             sr = out_wavs[0][1]
             final_data = np.concatenate([d for d, _ in out_wavs], axis=0)
         final_path = os.path.join(td, "output.wav")
         sf.write(final_path, final_data, sr)
         return final_path
+# ---- Minimal CSS: one column + hide footer / badges / settings
 HIDE_CSS = """
+/* compact one-column center */
 .gradio-container { max-width: 880px !important; margin: 0 auto; }
+/* hide footer & badges & embed/info areas */
+footer, .footer, #footer, [data-testid="block-analytics"], [data-testid="embed-info"] { display:none !important; }
+a[href*="gradio.live"], a[href*="gradio.app"], a[href*="hf.space"] { display:none !important; }
+/* hide settings button in many themes */
+button[aria-label="Settings"] { display:none !important; }
 """
 with gr.Blocks(
     title="TalkClone - Voice Cloning & TTS",
     css=HIDE_CSS,
     analytics_enabled=False
 ) as demo:
     gr.Markdown("## TalkClone — Turn Text into Speech from a Reference Voice")
     gr.Markdown(
+        "Upload a short **reference voice** (10–60s), choose **language**, enter **text**, click **Generate**.\n"
+        "**Tip:** Long texts are split by sentence for reliability; shorter sentences synthesize faster."
     )
     ref_audio = gr.Audio(label="Reference Voice (WAV/MP3)", type="filepath")
     text = gr.Textbox(label="Text", lines=6, placeholder="Type or paste your text here…")
     speed = gr.Slider(0.7, 1.3, value=1.0, step=0.05, label="Speed")
     split = gr.Checkbox(value=True, label="Auto split long text by sentence")
+    submit = gr.Button("Generate", variant="primary")
     output = gr.Audio(label="Cloned Speech", type="filepath", interactive=False)
     download = gr.File(label="Download audio")
     def run_and_return(text, ref_audio, language, speed, split):
+        path = tts_clone(text, ref_audio, language, speed, split)
+        return path, path
     submit.click(
         run_and_return,
     )
 if __name__ == "__main__":
     # Script entry point: enable the request queue, then start the server.
+    # IMPORTANT on Spaces: bind to the port Spaces gives you
     # Falls back to 7860 when the PORT environment variable is not set.
+    port = int(os.environ.get("PORT", "7860"))
     # NOTE(review): `concurrency_count` appears to be a Gradio 3.x `queue()`
     # argument that later releases replaced; confirm against the pinned version.
     demo.queue(concurrency_count=1).launch(
         server_name="0.0.0.0",
+        server_port=port,
         show_error=True,
+        show_api=False,
     )