drixo committed on
Commit
863b347
·
verified ·
1 Parent(s): 057f29d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +95 -29
app.py CHANGED
@@ -1,20 +1,54 @@
 
 
 
1
  import gradio as gr
2
  import soundfile as sf
3
  import torch
4
- import sys, os
5
- from transformers import MarianMTModel, MarianTokenizer, pipeline
6
  from huggingface_hub import snapshot_download
 
7
 
8
  # --------------------------
9
- # Download Index-TTS repo from Hugging Face
10
  # --------------------------
11
- repo_path = snapshot_download("IndexTeam/Index-TTS", local_dir="checkpoints")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  sys.path.append(repo_path)
13
 
14
  from indextts.infer import IndexTTS
15
 
16
- # Init TTS
17
- tts = IndexTTS(model_dir=repo_path, cfg_path=os.path.join(repo_path, "config.yaml"))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
 
19
  # --------------------------
20
  # Translation models
@@ -23,46 +57,68 @@ language_models = {
23
  "Spanish → English": "Helsinki-NLP/opus-mt-es-en",
24
  "English → Spanish": "Helsinki-NLP/opus-mt-en-es"
25
  }
26
- current_model_name = language_models["Spanish → English"]
27
- tokenizer = MarianTokenizer.from_pretrained(current_model_name)
28
- model = MarianMTModel.from_pretrained(current_model_name)
29
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  # Speech-to-text (ASR)
 
31
  asr = pipeline("automatic-speech-recognition", model="openai/whisper-small")
32
 
33
  # --------------------------
34
- # Functions
35
  # --------------------------
36
- def text_to_speech(text, ref_voice):
37
- output_path = "output.wav"
38
- tts.infer(ref_voice, text, output_path)
39
- data, samplerate = sf.read(output_path)
40
- return samplerate, data
 
 
 
 
 
41
 
42
  def translate_with_voice(audio, lang_pair, ref_voice):
 
 
 
 
 
 
43
  # 1) Speech to text
44
- text_input = asr(audio)["text"]
45
 
46
  # 2) Translation
47
- global tokenizer, model, current_model_name
48
- if language_models[lang_pair] != current_model_name:
49
- current_model_name = language_models[lang_pair]
50
- tokenizer = MarianTokenizer.from_pretrained(current_model_name)
51
- model = MarianMTModel.from_pretrained(current_model_name)
52
-
53
  inputs = tokenizer(text_input, return_tensors="pt", padding=True)
54
- translated = model.generate(**inputs)
55
- translated_text = tokenizer.decode(translated[0], skip_special_tokens=True)
56
 
57
  # 3) Text to speech
58
- sr, audio_array = text_to_speech(translated_text, ref_voice)
59
- return translated_text, (sr, audio_array)
60
 
61
  # --------------------------
62
  # Gradio UI
63
  # --------------------------
 
 
 
 
 
 
64
  with gr.Blocks() as demo:
65
- gr.Markdown("## 🗣 Voice-Cloned Translator (English ↔ Spanish)")
66
 
67
  with gr.Row():
68
  with gr.Column():
@@ -73,7 +129,7 @@ with gr.Blocks() as demo:
73
 
74
  with gr.Column():
75
  text_output = gr.Textbox(label="Translated Text")
76
- audio_output = gr.Audio(label="🔊 Translated Audio", type="numpy")
77
 
78
  btn.click(
79
  fn=translate_with_voice,
@@ -81,4 +137,14 @@ with gr.Blocks() as demo:
81
  outputs=[text_output, audio_output]
82
  )
83
 
84
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import tempfile
4
  import gradio as gr
5
  import soundfile as sf
6
  import torch
 
 
7
  from huggingface_hub import snapshot_download
8
+ from transformers import MarianMTModel, MarianTokenizer, pipeline
9
 
10
  # --------------------------
11
+ # Download IndexTTS repo from Hugging Face
12
  # --------------------------
13
+ CHECKPOINTS_DIR = os.path.abspath("checkpoints")
14
+ os.makedirs(CHECKPOINTS_DIR, exist_ok=True)
15
+
16
+ repo_path = snapshot_download(
17
+ repo_id="mlx-community/IndexTTS", # Correct repo
18
+ local_dir=CHECKPOINTS_DIR,
19
+ local_dir_use_symlinks=False,
20
+ allow_patterns=[
21
+ "config.yaml",
22
+ "bpe.model",
23
+ "unigram_12000.vocab",
24
+ "gpt.pth",
25
+ "bigvgan_generator.pth",
26
+ "bigvgan_discriminator.pth",
27
+ "dvae.pth",
28
+ ],
29
+ )
30
  sys.path.append(repo_path)
31
 
32
  from indextts.infer import IndexTTS
33
 
34
+ # --------------------------
35
+ # Initialize TTS safely
36
+ # --------------------------
37
+ _tts = None
38
+ def get_tts():
39
+ global _tts
40
+ if _tts is None:
41
+ try:
42
+ _tts = IndexTTS(model_dir=repo_path, cfg_path=os.path.join(repo_path, "config.yaml"))
43
+ except FileNotFoundError as e:
44
+ print("Error loading IndexTTS:", e)
45
+ raise gr.Error("IndexTTS model files not found!")
46
+ return _tts
47
+
48
+ # Limit CPU threads (important for Spaces)
49
+ torch.set_num_threads(1)
50
+ os.environ["OMP_NUM_THREADS"] = "1"
51
+ os.environ["MKL_NUM_THREADS"] = "1"
52
 
53
  # --------------------------
54
  # Translation models
 
57
  "Spanish → English": "Helsinki-NLP/opus-mt-es-en",
58
  "English → Spanish": "Helsinki-NLP/opus-mt-en-es"
59
  }
 
 
 
60
 
61
+ current_model_name = None
62
+ tokenizer = None
63
+ model = None
64
+
65
+ def load_translation_model(lang_pair):
66
+ global current_model_name, tokenizer, model
67
+ if language_models[lang_pair] != current_model_name:
68
+ current_model_name = language_models[lang_pair]
69
+ tokenizer = MarianTokenizer.from_pretrained(current_model_name)
70
+ model = MarianMTModel.from_pretrained(current_model_name)
71
+
72
+ # --------------------------
73
  # Speech-to-text (ASR)
74
+ # --------------------------
75
  asr = pipeline("automatic-speech-recognition", model="openai/whisper-small")
76
 
77
  # --------------------------
78
+ # Core functions
79
  # --------------------------
80
+ def text_to_speech(text, ref_voice_path):
81
+ """
82
+ Convert text to speech using IndexTTS.
83
+ Returns a temporary WAV file path.
84
+ """
85
+ tts = get_tts()
86
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
87
+ out_path = tmp.name
88
+ tts.infer(ref_voice_path, text, out_path)
89
+ return out_path
90
 
91
  def translate_with_voice(audio, lang_pair, ref_voice):
92
+ # Handle Gradio sending numpy array + sample_rate
93
+ if isinstance(audio, tuple):
94
+ audio_path = audio[0] # (filepath, sample_rate) or (sample_rate, array)
95
+ else:
96
+ audio_path = audio
97
+
98
  # 1) Speech to text
99
+ text_input = asr(audio_path)["text"]
100
 
101
  # 2) Translation
102
+ load_translation_model(lang_pair)
 
 
 
 
 
103
  inputs = tokenizer(text_input, return_tensors="pt", padding=True)
104
+ translated_ids = model.generate(**inputs)
105
+ translated_text = tokenizer.decode(translated_ids[0], skip_special_tokens=True)
106
 
107
  # 3) Text to speech
108
+ out_wav_path = text_to_speech(translated_text, ref_voice)
109
+ return translated_text, out_wav_path
110
 
111
  # --------------------------
112
  # Gradio UI
113
  # --------------------------
114
+ title = "🗣 Voice-Cloned Translator (English ↔ Spanish)"
115
+ description = """
116
+ Upload a short **reference voice** (5–10s, clean speech works best) and speak into the microphone.
117
+ This Space uses **IndexTTS** for zero-shot voice cloning and **Hugging Face models** for translation.
118
+ """
119
+
120
  with gr.Blocks() as demo:
121
+ gr.Markdown(f"# {title}\n{description}")
122
 
123
  with gr.Row():
124
  with gr.Column():
 
129
 
130
  with gr.Column():
131
  text_output = gr.Textbox(label="Translated Text")
132
+ audio_output = gr.Audio(label="🔊 Translated Audio", type="filepath")
133
 
134
  btn.click(
135
  fn=translate_with_voice,
 
137
  outputs=[text_output, audio_output]
138
  )
139
 
140
+ # Preload TTS on startup
141
+ def _startup():
142
+ try:
143
+ get_tts()
144
+ except Exception as e:
145
+ print("Warmup failed:", e)
146
+
147
+ if __name__ == "__main__":
148
+ _startup()
149
+ demo.launch(server_name="0.0.0.0", server_port=7860)
150
+