Spaces:

asbgig
/

talkclone

Runtime error

App Files Files Community

asbgig committed on Aug 22, 2025

Commit

bf4353b

verified ·

1 Parent(s): 2c102d1

Update app.py

Browse files

Files changed (1) hide show

app.py +129 -88

app.py CHANGED Viewed

@@ -1,128 +1,169 @@
-import os, re, tempfile
 import numpy as np
 import soundfile as sf
 import gradio as gr
 from TTS.api import TTS
-# -------- speed / device --------
-USE_GPU = os.environ.get("USE_GPU", "1") == "1"  # set to 1 if you switch Space to GPU (T4, A10G)
 MODEL_NAME = "tts_models/multilingual/multi-dataset/xtts_v2"
-_tts = None
-def get_tts():
-    global _tts
-    if _tts is None:
-        t = TTS(MODEL_NAME)
-        try:
-            if USE_GPU:
-                t = t.to("cuda")
-        except Exception:
-            pass
-        _tts = t
-    return _tts
 LANGS = [
-    ("English","en"),("Urdu","ur"),("Hindi","hi"),("Arabic","ar"),
-    ("French","fr"),("German","de"),("Spanish","es"),("Italian","it"),
-    ("Portuguese","pt"),("Turkish","tr"),
 ]
-def clean_text(text:str)->str:
     return " ".join((text or "").strip().split())
-def synth_to_file_safe(tts, txt, out_path, wav_path, lang, speed):
     try:
-        tts.tts_to_file(text=txt, file_path=out_path,
-                        speaker_wav=wav_path, language=lang, speed=speed)
     except TypeError:
-        tts.tts_to_file(text=txt, file_path=out_path,
-                        speaker_wav=wav_path, language=lang)
-def split_sentences(text:str):
-    parts = [s.strip() for s in re.split(r'(?<=[.!?])\s+', text) if s.strip()]
-    return parts or [text]
-def tts_clone(text, ref_audio, language_code, speed, split_long, progress=gr.Progress()):
-    if not ref_audio:
-        raise gr.Error("Please upload a reference voice sample (10–60 seconds).")
     text = clean_text(text)
     if not text:
         raise gr.Error("Please enter some text.")
-    # hard guard for CPU: very long text can take a long time
-    if not USE_GPU and len(text) > 600:
-        raise gr.Error("Text is long for CPU. Please try ≤ 600 characters, or switch the Space to a GPU for long texts.")
-    tts = get_tts()
     wav_path = ref_audio
-    chunks = split_sentences(text) if split_long else [text]
-    out_wavs = []
     with tempfile.TemporaryDirectory() as td:
         for i, chunk in enumerate(chunks, 1):
-            progress((i-1)/max(1,len(chunks)), desc=f"Generating part {i}/{len(chunks)}")
             out_path = os.path.join(td, f"part_{i}.wav")
-            synth_to_file_safe(tts, chunk, out_path, wav_path, language_code, speed)
             data, sr = sf.read(out_path)
             out_wavs.append((data, sr))
-        sr = out_wavs[0][1]
-        final = out_wavs[0][0] if len(out_wavs)==1 else np.concatenate([d for d,_ in out_wavs], axis=0)
         final_path = os.path.join(td, "output.wav")
-        sf.write(final_path, final, sr)
         return final_path
-# ---------------- UI ----------------
-THEME = gr.themes.Soft(
-    primary_hue="blue", neutral_hue="slate"
-).set(
-    body_background_fill="#ffffff",
-    block_background_fill="#ffffff",
-    block_border_width="1px",
-    block_border_color="#e5e7eb",
-    radius_xl="14px"
-)
-CUSTOM_CSS = """
-/* one-column layout */
-.container {max-width: 880px; margin: 24px auto;}
-/* hide footer (“Built with Gradio”) */
-footer { display: none !important; }
-/* hide top-right toolbar (API / Settings / etc.) */
-button[aria-label="Use via API"],
-button[aria-label="Settings"],
-a[href*="gradio.app"] { display:none !important; }
-/* tighten widgets */
-.gradio-container .wrap {gap: 8px;}
 """
-with gr.Blocks(theme=THEME, css=CUSTOM_CSS, fill_height=True, title="TalkClone - Voice Cloning & TTS") as demo:
-    gr.HTML('<div class="container"><h1 style="margin:0 0 8px;font-weight:700;">TalkClone — Clone a voice & generate speech</h1>'
-            '<p style="margin:0 0 16px;color:#334155;">Upload a clean reference (10–60s), choose language, enter text, then Generate.</p></div>')
-    with gr.Group():
-        with gr.Column(scale=1):
-            ref_audio = gr.Audio(label="Reference Voice (WAV/MP3)", type="filepath")
-            language = gr.Dropdown(choices=LANGS, value="en", label="Language")
-            text = gr.Textbox(label="Text", lines=6, placeholder="Type your text here…")
-            speed = gr.Slider(0.7, 1.3, value=1.0, step=0.05, label="Speed")
-            split = gr.Checkbox(value=True, label="Auto split long text by sentence")
-            generate = gr.Button("Generate", variant="primary", scale=1)
-    with gr.Group():
-        output = gr.Audio(label="Cloned Speech", type="filepath", interactive=False)
-        download = gr.File(label="Download audio")
     def run_and_return(text, ref_audio, language, speed, split):
-        p = tts_clone(text, ref_audio, language, speed, split)
-        return p, p
-    generate.click(run_and_return,
-                   inputs=[text, ref_audio, language, speed, split],
-                   outputs=[output, download])
 if __name__ == "__main__":
-    # hide “Use via API”, keep errors off in production
-    demo.launch(show_api=False, show_error=False)

+# app.py — TalkClone (HF Space, one-column, footer hidden)
+import os
+import tempfile
+import re
 import numpy as np
 import soundfile as sf
+# --- Coqui XTTS license prompt (must be set in headless envs like Spaces)
+os.environ.setdefault("COQUI_TOS_AGREED", "1")
 import gradio as gr
 from TTS.api import TTS
+# ----------------------------
+# Model: Coqui XTTS v2
+# ----------------------------
 MODEL_NAME = "tts_models/multilingual/multi-dataset/xtts_v2"
+# Use the GPU when one is available (on HF, switch Space hardware to a GPU in Settings).
+try:
+    import torch
+    USE_GPU = torch.cuda.is_available()
+except ImportError:
+    # torch is not importable: stay on CPU, but keep USE_GPU defined for any later reader.
+    USE_GPU = False
+try:
+    tts = TTS(MODEL_NAME, gpu=USE_GPU)  # some versions accept gpu kwarg
+except TypeError:
+    # This TTS build rejects the `gpu` kwarg: load normally, then move the model
+    # explicitly so a GPU Space does not silently end up running on CPU.
+    tts = TTS(MODEL_NAME)
+    if USE_GPU:
+        tts = tts.to("cuda")
+except Exception as exc:
+    # Best-effort fallback (e.g. the GPU load itself failed): report why instead of
+    # hiding it, then retry with the default constructor.
+    print(f"[TalkClone] TTS(gpu={USE_GPU}) failed ({exc!r}); retrying with the default constructor")
+    tts = TTS(MODEL_NAME)
+# (label, value) pairs -> UI shows label, function receives code
+# NOTE(review): "ur" (Urdu) does not appear to be in the XTTS v2 supported-language
+# list — confirm the model accepts it; otherwise choosing Urdu will fail at synthesis time.
 LANGS = [
+    ("English", "en"),
+    ("Urdu", "ur"),
+    ("Hindi", "hi"),
+    ("Arabic", "ar"),
+    ("French", "fr"),
+    ("German", "de"),
+    ("Spanish", "es"),
+    ("Italian", "it"),
+    ("Portuguese", "pt"),
+    ("Turkish", "tr"),
 ]
+def clean_text(text: str) -> str:
+    """Trim `text` and collapse every run of whitespace into a single space.
+
+    A falsy value (``None`` or ``""``) yields ``""``.
+    """
     return " ".join((text or "").strip().split())
+def synth_to_file_safe(txt, out_path, wav_path, lang, speed):
+    """
+    Call XTTS with 'speed' if supported; fall back without it if not.
+    Some XTTS builds ignore/raise on speed, so we guard it.
+
+    Args:
+        txt: Text to synthesize.
+        out_path: Destination file path handed to ``tts_to_file``.
+        wav_path: Reference voice file, passed as ``speaker_wav``.
+        lang: Language code, passed as ``language``.
+        speed: Speed value; dropped on the retry when the first call raises TypeError.
+    """
     try:
+        tts.tts_to_file(
+            text=txt,
+            file_path=out_path,
+            speaker_wav=wav_path,
+            language=lang,
+            speed=speed,
+        )
     except TypeError:
+        # Older/newer variants may not accept "speed"
+        # (any TypeError from the first call triggers this retry, not only a rejected kwarg)
+        tts.tts_to_file(
+            text=txt,
+            file_path=out_path,
+            speaker_wav=wav_path,
+            language=lang,
+        )
+def tts_clone(text, ref_audio, language_code, speed, split_sentences, progress=gr.Progress(track_tqdm=True)):
+    """Synthesize `text` in the voice of `ref_audio` and return the path of the output WAV.
+
+    Args:
+        text: Text to speak; whitespace is normalized with clean_text().
+        ref_audio: File path of the reference voice sample (gr.Audio with type='filepath').
+        language_code: Language code forwarded to synth_to_file_safe().
+        speed: Speed value forwarded to synth_to_file_safe().
+        split_sentences: When truthy, synthesize sentence by sentence and join the parts.
+        progress: Gradio progress tracker, updated once per chunk.
+
+    Returns:
+        Path of a WAV file that still exists after this function returns.
+
+    Raises:
+        gr.Error: If the reference audio or the text is missing.
+    """
+    # Basic checks (an empty path is rejected too, not only None)
+    if not ref_audio:
+        raise gr.Error("Please upload a reference voice sample (10–60 seconds of clean speech).")
     text = clean_text(text)
     if not text:
         raise gr.Error("Please enter some text.")
+    # Gradio passes a file path when type='filepath'
     wav_path = ref_audio
+    # Split long text into sentences so each synthesis call handles a short input
+    chunks = [text]
+    if split_sentences:
+        # Fall back to the whole text if the split ever yields nothing
+        chunks = [s.strip() for s in re.split(r'(?<=[.!?؟۔])\s+', text) if s.strip()] or [text]
+    out_wavs = []
     with tempfile.TemporaryDirectory() as td:
         for i, chunk in enumerate(chunks, 1):
+            progress((i - 1) / max(len(chunks), 1), desc=f"Synthesizing part {i}/{len(chunks)}")
             out_path = os.path.join(td, f"part_{i}.wav")
+            synth_to_file_safe(chunk, out_path, wav_path, language_code, speed)
             data, sr = sf.read(out_path)
             out_wavs.append((data, sr))
+    # The per-chunk files are gone now; the audio itself is held in out_wavs.
+    # Concatenate all parts, using the first part's sample rate for the result
+    sr = out_wavs[0][1]
+    if len(out_wavs) == 1:
+        final_data = out_wavs[0][0]
+    else:
+        final_data = np.concatenate([d for d, _ in out_wavs], axis=0)
+    # Save the final output OUTSIDE the TemporaryDirectory: that directory is deleted
+    # when its `with` block exits, so a path inside it would no longer exist by the
+    # time Gradio tries to read the returned file.
+    fd, final_path = tempfile.mkstemp(prefix="talkclone_", suffix=".wav")
+    os.close(fd)
+    sf.write(final_path, final_data, sr)
+    return final_path
+# ---- Minimal CSS handed to gr.Blocks(css=...): one-column layout + hide footer / badges / settings
+HIDE_CSS = """
+/* one-column width */
+.gradio-container { max-width: 880px !important; margin: 0 auto; }
+/* hide footer + badges + "Use via API" strip */
+footer, .footer, #footer { display: none !important; }
+a[href*="gradio.live"], a[href*="gradio.app"], a[href*="hf.space"] { display: none !important; }
+/* hide top-right settings gear / menu in many themes */
+button[aria-label="Settings"], [data-testid="block-analytics"], [data-testid="embed-info"] { display:none !important; }
 """
+# Soft theme with indigo/slate hues and a few fill/border overrides; passed to gr.Blocks(theme=...).
+# NOTE(review): the "*name" values presumably reference theme variables — confirm that
+# "*white" resolves to a defined variable (the hue-based ones like "*primary_500" look standard).
+THEME = gr.themes.Soft(
+    primary_hue="indigo",
+    neutral_hue="slate",
+).set(
+    body_background_fill="*white",
+    button_primary_background_fill="*primary_500",
+    button_primary_background_fill_hover="*primary_600",
+    input_background_fill="*neutral_50",
+    input_border_color="*neutral_200",
+)
+# Build the single-column UI: inputs first, then the Generate button, then the two outputs.
+with gr.Blocks(
+    title="TalkClone - Voice Cloning & TTS",
+    theme=THEME,
+    css=HIDE_CSS,
+    analytics_enabled=False
+) as demo:
+    gr.Markdown("## TalkClone — Turn Text into Speech from a Reference Voice")
+    gr.Markdown(
+        "Upload a short, clean **reference voice** (10–60s), pick a **language**, paste your **text**, and click **Generate**.\n\n"
+        "**Tip for speed:** long paragraphs synthesize faster if you keep them under ~20–30 seconds per sentence. "
+        "For best cloning quality, avoid background music/noise."
+    )
+    # type="filepath" makes the handler receive a path string rather than raw audio data
+    ref_audio = gr.Audio(label="Reference Voice (WAV/MP3)", type="filepath")
+    language = gr.Dropdown(choices=LANGS, value="en", label="Language")
+    text = gr.Textbox(label="Text", lines=6, placeholder="Type or paste your text here…")
+    speed = gr.Slider(0.7, 1.3, value=1.0, step=0.05, label="Speed")
+    split = gr.Checkbox(value=True, label="Auto split long text by sentence")
+    submit = gr.Button("Generate", variant="primary", scale=1)
+    output = gr.Audio(label="Cloned Speech", type="filepath", interactive=False)
+    download = gr.File(label="Download audio")
     def run_and_return(text, ref_audio, language, speed, split):
+        # Same path twice: once for the audio player, once for the download widget
+        out_path = tts_clone(text, ref_audio, language, speed, split)
+        return out_path, out_path
+    submit.click(
+        run_and_return,
+        inputs=[text, ref_audio, language, speed, split],
+        outputs=[output, download]
+    )
 if __name__ == "__main__":
+    # Allow one synthesis at a time. Gradio 4 replaced the queue's `concurrency_count`
+    # argument with `default_concurrency_limit`, so try the new name first and fall
+    # back to the old one on Gradio 3.x, where the new keyword raises TypeError.
+    try:
+        demo.queue(default_concurrency_limit=1)
+    except TypeError:
+        demo.queue(concurrency_count=1)
+    # On HF Spaces, don't open browser; show_api=False hides "Use via API".
+    demo.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        show_error=True,
+        show_api=False
+    )