Update app.py
app.py CHANGED

@@ -1,113 +1,84 @@
-import os
-import tempfile
 import gradio as gr
-
 import torch
-
-
-
-CHECKPOINTS_DIR = os.path.abspath("checkpoints")
-
-def load_model():
-    """
-    Download IndexTTS model weights (if needed) and initialize IndexTTS once.
-    """
-    os.makedirs(CHECKPOINTS_DIR, exist_ok=True)
-
-    # Download weights from HF Hub
-    repo_path = snapshot_download(
-        repo_id="mlx-community/IndexTTS",
-        local_dir=CHECKPOINTS_DIR,
-        local_dir_use_symlinks=False,
-        allow_patterns=[
-            "config.yaml",
-            "bpe.model",
-            "unigram_12000.vocab",
-            "gpt.pth",
-            "bigvgan_generator.pth",
-            "bigvgan_discriminator.pth",
-            "dvae.pth",
-        ],
-    )
-
-    # Debug: verify files
-    print("Downloaded files:", os.listdir(repo_path))
-
-    cfg_file = os.path.join(repo_path, "config.yaml")
-    if not os.path.exists(cfg_file):
-        raise FileNotFoundError(f"Cannot find config.yaml in {repo_path}. Check repo contents.")
-
-    # Limit CPU threads for Spaces
-    os.environ.setdefault("OMP_NUM_THREADS", "1")
-    os.environ.setdefault("MKL_NUM_THREADS", "1")
-    try:
-        torch.set_num_threads(1)
-    except Exception:
-        pass
-
-    # Initialize IndexTTS
-    tts = IndexTTS(model_dir=repo_path, cfg_path=cfg_file)
-    return tts
-
-# Global singleton for TTS
-_tts = None
-def get_tts():
-    global _tts
-    if _tts is None:
-        _tts = load_model()
-    return _tts
-
-def synthesize(voice_path, text):
-    """
-    Gradio inference function.
-    voice_path: path to reference voice (WAV recommended)
-    text: string to synthesize
-    Returns: path to output WAV
-    """
-    if not voice_path or not os.path.exists(voice_path):
-        raise gr.Error("Please upload a short reference voice clip (WAV recommended).")
-    if not text or not text.strip():
-        raise gr.Error("Please enter text to synthesize.")
-
-    tts = get_tts()

-    …
-
-    return out_path

 # Gradio UI
-
-description = """
-Upload a short **reference voice** (5–10s, clean speech works best) and enter text.
-This Space runs **IndexTTS** in CPU mode by default, so first run may take a while to warm up.
-"""
-
 with gr.Blocks() as demo:
-    gr.Markdown(

     with gr.Row():
         with gr.Column():
-            …
-            audio_out = gr.Audio(label="Output Audio", type="filepath")
-            log = gr.Markdown("")
-
-    btn.click(fn=synthesize, inputs=[voice, text], outputs=[audio_out])

-    …
-        get_tts()
-        print("TTS model loaded successfully at startup.")
-    except Exception as e:
-        print("Warmup failed:", e)

-…
-
 import gradio as gr
+import soundfile as sf
 import torch
+import sys, os
+from transformers import MarianMTModel, MarianTokenizer, pipeline
+from huggingface_hub import snapshot_download

+# --------------------------
+# Download Index-TTS repo from Hugging Face
+# --------------------------
+repo_path = snapshot_download("IndexTeam/Index-TTS", local_dir="checkpoints")
+sys.path.append(repo_path)

+from indextts.infer import IndexTTS

+# Init TTS
+tts = IndexTTS(model_dir=repo_path, cfg_path=os.path.join(repo_path, "config.yaml"))
+
+# --------------------------
+# Translation models
+# --------------------------
+language_models = {
+    "Spanish → English": "Helsinki-NLP/opus-mt-es-en",
+    "English → Spanish": "Helsinki-NLP/opus-mt-en-es"
+}
+current_model_name = language_models["Spanish → English"]
+tokenizer = MarianTokenizer.from_pretrained(current_model_name)
+model = MarianMTModel.from_pretrained(current_model_name)
+
+# Speech-to-text (ASR)
+asr = pipeline("automatic-speech-recognition", model="openai/whisper-small")
+
+# --------------------------
+# Functions
+# --------------------------
+def text_to_speech(text, ref_voice):
+    output_path = "output.wav"
+    tts.infer(ref_voice, text, output_path)
+    data, samplerate = sf.read(output_path)
+    return samplerate, data
+
+def translate_with_voice(audio, lang_pair, ref_voice):
+    # 1) Speech to text
+    text_input = asr(audio)["text"]
+
+    # 2) Translation
+    global tokenizer, model, current_model_name
+    if language_models[lang_pair] != current_model_name:
+        current_model_name = language_models[lang_pair]
+        tokenizer = MarianTokenizer.from_pretrained(current_model_name)
+        model = MarianMTModel.from_pretrained(current_model_name)
+
+    inputs = tokenizer(text_input, return_tensors="pt", padding=True)
+    translated = model.generate(**inputs)
+    translated_text = tokenizer.decode(translated[0], skip_special_tokens=True)
+
+    # 3) Text to speech
+    sr, audio_array = text_to_speech(translated_text, ref_voice)
+    return translated_text, (sr, audio_array)
+
+# --------------------------
 # Gradio UI
+# --------------------------
 with gr.Blocks() as demo:
+    gr.Markdown("## 🗣 Voice-Cloned Translator (English ↔ Spanish)")

     with gr.Row():
         with gr.Column():
+            audio_input = gr.Audio(sources=["microphone"], type="filepath", label="🎤 Speak")
+            lang_dropdown = gr.Dropdown(list(language_models.keys()), label="🌍 Target Language", value="Spanish → English")
+            ref_voice_input = gr.Audio(sources=["upload"], type="filepath", label="🎧 Reference Voice (5–10s)")
+            btn = gr.Button("Translate & Speak")

+        with gr.Column():
+            text_output = gr.Textbox(label="Translated Text")
+            audio_output = gr.Audio(label="🔊 Translated Audio", type="numpy")

+    btn.click(
+        fn=translate_with_voice,
+        inputs=[audio_input, lang_dropdown, ref_voice_input],
+        outputs=[text_output, audio_output]
+    )

+demo.launch()
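
For quick verification, here is a minimal sketch of driving the new pipeline without the UI. It is not part of the commit: it assumes it runs in the same process as app.py (so translate_with_voice, language_models, and sf are in scope), and "recording.wav" and "ref_voice.wav" are hypothetical local files.

# Smoke test for the new pipeline (a sketch, not part of the commit).
# "recording.wav" and "ref_voice.wav" are hypothetical placeholder files.
translated_text, (sr, audio) = translate_with_voice(
    "recording.wav",        # speech to translate (any audio file Whisper can read)
    "Spanish → English",    # must match a key in language_models
    "ref_voice.wav",        # 5-10 s clean reference clip to clone
)
print(translated_text)
sf.write("translated.wav", audio, sr)   # save the cloned-voice output

The unpacking mirrors the function's return value: the translated string first, then the (sample_rate, numpy_array) pair that gr.Audio(type="numpy") expects.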
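One operational note: translate_with_voice only downloads a MarianMT checkpoint the first time each direction is requested, so the first click after switching languages can stall while weights download. A possible warm-up, sketched under the assumption that it is placed in app.py after the language_models dict and that caching both directions fits the Space's disk quota:

# Optional warm-up (a sketch, not in the commit): pre-fetch both MarianMT
# checkpoints so switching direction never blocks a user's first click.
for repo_id in language_models.values():
    MarianTokenizer.from_pretrained(repo_id)   # populates the local HF cache
    MarianMTModel.from_pretrained(repo_id)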