Spaces:

drixo
/

Translator

Sleeping

App Files Files Community

drixo committed on Sep 4, 2025

Commit

b0d995c

verified ·

1 Parent(s): aa4d216

Update app.py

Browse files

Files changed (1) hide show

app.py +26 -40

app.py CHANGED Viewed

@@ -2,29 +2,23 @@ import os
 import tempfile
 import gradio as gr
 from huggingface_hub import snapshot_download
-# If torch is optional for you, you can keep this minimal
 import torch
-# Import after deps are installed (handled by requirements.txt)
 from indextts.infer import IndexTTS
 CHECKPOINTS_DIR = os.path.abspath("checkpoints")
 def load_model():
     """
-    Download model weights (if needed) and initialize IndexTTS once.
-    Avoids the 'checkpoints/checkpoints' double-path bug by using the exact
-    path returned from snapshot_download.
     """
     os.makedirs(CHECKPOINTS_DIR, exist_ok=True)
-    # Download to a fixed directory; do NOT prefix this path again later.
     repo_path = snapshot_download(
         repo_id="mlx-community/IndexTTS",
         local_dir=CHECKPOINTS_DIR,
-        local_dir_use_symlinks=False,   # ensures real files (safer in Spaces)
         allow_patterns=[
             "config.yaml",
             "bpe.model",
@@ -36,7 +30,14 @@ def load_model():
         ],
     )
-    # Optional: keep CPU stable in Spaces and prevent over-threading
     os.environ.setdefault("OMP_NUM_THREADS", "1")
     os.environ.setdefault("MKL_NUM_THREADS", "1")
     try:
@@ -44,12 +45,11 @@ def load_model():
     except Exception:
         pass
-    # Initialize IndexTTS. IMPORTANT: pass repo_path directly.
-    tts = IndexTTS(model_dir=repo_path, cfg_path=os.path.join(repo_path, "config.yaml"))
     return tts
-# Global singleton (loaded once on Space startup)
 _tts = None
 def get_tts():
     global _tts
@@ -57,37 +57,32 @@ def get_tts():
         _tts = load_model()
     return _tts
 def synthesize(voice_path, text):
     """
     Gradio inference function.
-    - voice_path: path to uploaded reference voice (WAV strongly recommended)
-    - text: the text to speak
-    Returns (output_wav_path)
     """
     if not voice_path or not os.path.exists(voice_path):
         raise gr.Error("Please upload a short reference voice clip (WAV recommended).")
     if not text or not text.strip():
-        raise gr.Error("Please enter the text to speak.")
     tts = get_tts()
-    # Write output to a temporary WAV file; Gradio will serve it.
     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
         out_path = tmp.name
-    # Minimal call; IndexTTS handles normalization/phonemization internally.
-    # You can add extra kwargs if the library exposes them (e.g., speed, seed).
     tts.infer(voice_path, text.strip(), out_path)
     return out_path
 title = "IndexTTS – Zero-shot Voice Cloning (HF Space)"
 description = """
 Upload a short **reference voice** (5–10s, clean speech works best) and enter text.
-This Space runs **IndexTTS** in CPU mode by default, so first run may take a bit to warm up.
 """
 with gr.Blocks() as demo:
@@ -95,30 +90,21 @@ with gr.Blocks() as demo:
     with gr.Row():
         with gr.Column():
-            voice = gr.Audio(
-                sources=["upload"],
-                type="filepath",
-                label="Reference Voice (WAV preferred)"
-            )
-            text = gr.Textbox(
-                label="Text to Synthesize",
-                placeholder="Hello, how are you?",
-                lines=3
-            )
             btn = gr.Button("Generate Speech")
         with gr.Column():
             audio_out = gr.Audio(label="Output Audio", type="filepath")
             log = gr.Markdown("")
     btn.click(fn=synthesize, inputs=[voice, text], outputs=[audio_out])
-# Optional: pre-load at startup so first user call is faster
 def _startup():
     try:
         get_tts()
     except Exception as e:
-        # Don't crash the Space if warmup fails; show a note in Logs.
         print("Warmup failed:", e)
 if __name__ == "__main__":

 import tempfile
 import gradio as gr
 from huggingface_hub import snapshot_download
 import torch
 from indextts.infer import IndexTTS
+# Directory to store downloaded model files
 CHECKPOINTS_DIR = os.path.abspath("checkpoints")
 def load_model():
     """
+    Download IndexTTS model weights (if needed) and initialize IndexTTS once.
     """
     os.makedirs(CHECKPOINTS_DIR, exist_ok=True)
+    # Download weights from HF Hub
     repo_path = snapshot_download(
         repo_id="mlx-community/IndexTTS",
         local_dir=CHECKPOINTS_DIR,
+        local_dir_use_symlinks=False,
         allow_patterns=[
             "config.yaml",
             "bpe.model",
         ],
     )
+    # Debug: verify files
+    print("Downloaded files:", os.listdir(repo_path))
+    cfg_file = os.path.join(repo_path, "config.yaml")
+    if not os.path.exists(cfg_file):
+        raise FileNotFoundError(f"Cannot find config.yaml in {repo_path}. Check repo contents.")
+    # Limit CPU threads for Spaces
     os.environ.setdefault("OMP_NUM_THREADS", "1")
     os.environ.setdefault("MKL_NUM_THREADS", "1")
     try:
     except Exception:
         pass
+    # Initialize IndexTTS
+    tts = IndexTTS(model_dir=repo_path, cfg_path=cfg_file)
     return tts
+# Global singleton for TTS
 _tts = None
 def get_tts():
     global _tts
         _tts = load_model()
     return _tts
 def synthesize(voice_path, text):
     """
     Gradio inference function.
+    voice_path: path to reference voice (WAV recommended)
+    text: string to synthesize
+    Returns: path to output WAV
     """
     if not voice_path or not os.path.exists(voice_path):
         raise gr.Error("Please upload a short reference voice clip (WAV recommended).")
     if not text or not text.strip():
+        raise gr.Error("Please enter text to synthesize.")
     tts = get_tts()
+    # Temporary output WAV
     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
         out_path = tmp.name
     tts.infer(voice_path, text.strip(), out_path)
     return out_path
+# Gradio UI
 title = "IndexTTS – Zero-shot Voice Cloning (HF Space)"
 description = """
 Upload a short **reference voice** (5–10s, clean speech works best) and enter text.
+This Space runs **IndexTTS** in CPU mode by default, so first run may take a while to warm up.
 """
 with gr.Blocks() as demo:
     with gr.Row():
         with gr.Column():
+            voice = gr.Audio(sources=["upload"], type="filepath", label="Reference Voice (WAV preferred)")
+            text = gr.Textbox(label="Text to Synthesize", placeholder="Hello, how are you?", lines=3)
             btn = gr.Button("Generate Speech")
         with gr.Column():
             audio_out = gr.Audio(label="Output Audio", type="filepath")
             log = gr.Markdown("")
     btn.click(fn=synthesize, inputs=[voice, text], outputs=[audio_out])
+# Optional startup preload
 def _startup():
     try:
         get_tts()
+        print("TTS model loaded successfully at startup.")
     except Exception as e:
         print("Warmup failed:", e)
 if __name__ == "__main__":