drixo committed on
Commit
1f7b49e
·
verified ·
1 Parent(s): 813795e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +113 -70
app.py CHANGED
@@ -1,84 +1,127 @@
 
 
1
  import gradio as gr
2
- import soundfile as sf
3
- import torch
4
- import sys, os
5
- from transformers import MarianMTModel, MarianTokenizer, pipeline
6
  from huggingface_hub import snapshot_download
7
 
8
# --------------------------
# Download Index-TTS repo from Hugging Face
# --------------------------
# Fetch the whole model repo into ./checkpoints; repo_path is the local checkout.
repo_path = snapshot_download("IndexTeam/Index-TTS", local_dir="checkpoints")
# Make the checkout importable — it ships the `indextts` package used below.
sys.path.append(repo_path)


from indextts.infer import IndexTTS

# Init TTS
# Voice-cloning TTS engine; config.yaml sits at the repo root.
tts = IndexTTS(model_dir=repo_path, cfg_path=os.path.join(repo_path, "config.yaml"))

# --------------------------
# Translation models
# --------------------------
# UI label -> MarianMT checkpoint; the active pair is swapped at request time
# by translate_with_voice when the user changes direction.
language_models = {
    "Spanish β†’ English": "Helsinki-NLP/opus-mt-es-en",
    "English β†’ Spanish": "Helsinki-NLP/opus-mt-en-es"
}
# Module-level translation state (mutated in place by translate_with_voice).
current_model_name = language_models["Spanish β†’ English"]
tokenizer = MarianTokenizer.from_pretrained(current_model_name)
model = MarianMTModel.from_pretrained(current_model_name)

# Speech-to-text (ASR)
asr = pipeline("automatic-speech-recognition", model="openai/whisper-small")
32
-
33
- # --------------------------
34
- # Functions
35
- # --------------------------
36
def text_to_speech(text, ref_voice):
    """Speak `text` in the style of `ref_voice` via the global IndexTTS engine.

    text: string to synthesize.
    ref_voice: path to the reference-voice clip handed to tts.infer.
    Returns (samplerate, data) — the tuple shape expected by a
    gr.Audio(type="numpy") output component.
    """
    # NOTE(review): fixed output filename — concurrent requests would clobber
    # each other's WAV; confirm the app serves one request at a time.
    output_path = "output.wav"
    tts.infer(ref_voice, text, output_path)
    # Read the synthesized file back so the caller gets in-memory audio.
    data, samplerate = sf.read(output_path)
    return samplerate, data
41
-
42
def translate_with_voice(audio, lang_pair, ref_voice):
    """Transcribe `audio`, translate it, and re-speak it in the reference voice.

    audio: path of the recorded input clip (fed to the Whisper ASR pipeline).
    lang_pair: key into `language_models` selecting the translation direction.
    ref_voice: path of the reference-voice clip used for cloning.
    Returns (translated_text, (sample_rate, samples)).
    """
    # 1) Speech to text
    text_input = asr(audio)["text"]

    # 2) Translation
    # Reload the MarianMT model/tokenizer into module state only when the
    # requested direction differs from the one currently loaded.
    global tokenizer, model, current_model_name
    if language_models[lang_pair] != current_model_name:
        current_model_name = language_models[lang_pair]
        tokenizer = MarianTokenizer.from_pretrained(current_model_name)
        model = MarianMTModel.from_pretrained(current_model_name)

    inputs = tokenizer(text_input, return_tensors="pt", padding=True)
    translated = model.generate(**inputs)
    # Single input sentence -> decode only the first generated sequence.
    translated_text = tokenizer.decode(translated[0], skip_special_tokens=True)

    # 3) Text to speech
    sr, audio_array = text_to_speech(translated_text, ref_voice)
    return translated_text, (sr, audio_array)
60
-
61
- # --------------------------
62
- # Gradio UI
63
- # --------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
# --------------------------
# Gradio UI
# --------------------------
with gr.Blocks() as demo:
    gr.Markdown("## πŸ—£ Voice-Cloned Translator (English ↔ Spanish)")

    with gr.Row():
        # Left column: inputs (speech, direction, reference voice).
        with gr.Column():
            audio_input = gr.Audio(sources=["microphone"], type="filepath", label="πŸŽ™ Speak")
            lang_dropdown = gr.Dropdown(list(language_models.keys()), label="🌍 Target Language", value="Spanish β†’ English")
            ref_voice_input = gr.Audio(sources=["upload"], type="filepath", label="🎧 Reference Voice (5–10s)")
            btn = gr.Button("Translate & Speak")

        # Right column: outputs (translated text + cloned-voice audio).
        with gr.Column():
            text_output = gr.Textbox(label="Translated Text")
            audio_output = gr.Audio(label="πŸ”Š Translated Audio", type="numpy")

    # Wire the button to the full ASR -> MT -> TTS pipeline.
    btn.click(
        fn=translate_with_voice,
        inputs=[audio_input, lang_dropdown, ref_voice_input],
        outputs=[text_output, audio_output]
    )

demo.launch()
 
1
+ import os
2
+ import tempfile
3
  import gradio as gr
 
 
 
 
4
  from huggingface_hub import snapshot_download
5
 
6
+ # If torch is optional for you, you can keep this minimal
7
+ import torch
 
 
 
8
 
9
+ # Import after deps are installed (handled by requirements.txt)
10
  from indextts.infer import IndexTTS
11
 
12
+
13
# Resolved once so the download target never depends on the working directory.
CHECKPOINTS_DIR = os.path.abspath("checkpoints")


def load_model():
    """Download the IndexTTS weights (if needed) and build the engine once.

    The path returned by ``snapshot_download`` is used verbatim when
    constructing IndexTTS, which avoids the 'checkpoints/checkpoints'
    double-path mistake of prefixing it again.

    Returns the initialized IndexTTS instance.
    """
    # Keep CPU usage predictable on shared Space hardware.
    os.environ.setdefault("OMP_NUM_THREADS", "1")
    os.environ.setdefault("MKL_NUM_THREADS", "1")
    try:
        torch.set_num_threads(1)
    except Exception:
        # Thread tuning is best-effort; never block startup on it.
        pass

    os.makedirs(CHECKPOINTS_DIR, exist_ok=True)

    # Only the files the engine needs at load/inference time.
    wanted_files = [
        "config.yaml",
        "bpe.model",
        "unigram_12000.vocab",
        "gpt.pth",
        "bigvgan_generator.pth",
        "bigvgan_discriminator.pth",
        "dvae.pth",
    ]

    # Download into a fixed directory; do NOT prefix this path again later.
    # NOTE(review): `local_dir_use_symlinks` is deprecated (ignored) in recent
    # huggingface_hub releases — confirm the pinned version still honors it.
    repo_path = snapshot_download(
        repo_id="mlx-community/IndexTTS",
        local_dir=CHECKPOINTS_DIR,
        local_dir_use_symlinks=False,  # ensures real files (safer in Spaces)
        allow_patterns=wanted_files,
    )

    # IMPORTANT: pass the snapshot path straight through.
    return IndexTTS(model_dir=repo_path, cfg_path=os.path.join(repo_path, "config.yaml"))
50
+
51
+
52
# Process-wide singleton: the TTS engine is built at most once per Space run.
_tts = None


def get_tts():
    """Return the shared IndexTTS instance, constructing it on first use."""
    global _tts
    if _tts is not None:
        return _tts
    _tts = load_model()
    return _tts
59
+
60
+
61
def synthesize(voice_path, text):
    """Gradio handler: clone the reference voice and speak `text`.

    voice_path: filesystem path of the uploaded reference clip (WAV recommended).
    text: the text to synthesize.
    Returns the path of the generated WAV file (served by Gradio).
    Raises gr.Error with a user-facing message when inputs are missing.
    """
    # Guard clauses: surface friendly errors in the UI instead of tracebacks.
    if not (voice_path and os.path.exists(voice_path)):
        raise gr.Error("Please upload a short reference voice clip (WAV recommended).")
    message = (text or "").strip()
    if not message:
        raise gr.Error("Please enter the text to speak.")

    engine = get_tts()

    # Allocate a .wav that outlives this call (delete=False) so Gradio can serve it.
    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    out_path = tmp.name
    tmp.close()

    # Minimal call; IndexTTS handles normalization/phonemization internally.
    engine.infer(voice_path, message, out_path)
    return out_path
85
+
86
+
87
# UI copy shown at the top of the Space.
title = "IndexTTS – Zero-shot Voice Cloning (HF Space)"
description = """
Upload a short **reference voice** (5–10s, clean speech works best) and enter text.
This Space runs **IndexTTS** in CPU mode by default, so first run may take a bit to warm up.
"""

with gr.Blocks() as demo:
    gr.Markdown(f"# {title}\n{description}")

    with gr.Row():
        # Left column: inputs.
        with gr.Column():
            voice = gr.Audio(
                sources=["upload"],
                type="filepath",
                label="Reference Voice (WAV preferred)"
            )
            text = gr.Textbox(
                label="Text to Synthesize",
                placeholder="Hello, how are you?",
                lines=3
            )
            btn = gr.Button("Generate Speech")

        # Right column: output audio (served by file path).
        with gr.Column():
            audio_out = gr.Audio(label="Output Audio", type="filepath")
            # NOTE(review): `log` is created but never updated by any handler.
            log = gr.Markdown("")

    btn.click(fn=synthesize, inputs=[voice, text], outputs=[audio_out])

# Optional: pre-load at startup so first user call is faster
def _startup():
    # Best-effort warmup of the TTS singleton.
    try:
        get_tts()
    except Exception as e:
        # Don't crash the Space if warmup fails; show a note in Logs.
        print("Warmup failed:", e)

if __name__ == "__main__":
    _startup()
    demo.launch(server_name="0.0.0.0", server_port=7860)
127