Spaces:

drixo
/

Translator

Sleeping

App Files Files Community

drixo committed on Sep 3, 2025

Commit

25f2399

verified ·

1 Parent(s): 5049836

Update app.py

Browse files

Files changed (1) hide show

app.py +74 -9

app.py CHANGED Viewed

@@ -1,15 +1,80 @@
-# Download model
 from huggingface_hub import snapshot_download
-snapshot_download(IndexTeam/Index-TTS, local_dir="checkpoints")
 from indextts.infer import IndexTTS
-# Ensure config.yaml is present in the checkpoints directory
 tts = IndexTTS(model_dir="checkpoints", cfg_path="checkpoints/config.yaml")
-voice = "path/to/your/reference_voice.wav"  # Path to the voice reference audio file
-text = "Hello, how are you?"
-output_path = "output_index.wav"
-tts.infer(voice, text, output_path)

+import gradio as gr
+from transformers import MarianMTModel, MarianTokenizer, pipeline
+import torch
+import numpy as np
 from huggingface_hub import snapshot_download
 from indextts.infer import IndexTTS
+import soundfile as sf
+# --------------------------
+# Download Index-TTS model from Hugging Face
+# --------------------------
+# Runs when the script starts: fetch the "IndexTeam/Index-TTS" repository into the
+# local "checkpoints" directory, then build the TTS engine from that directory and
+# the config.yaml expected inside it.
+snapshot_download("IndexTeam/Index-TTS", local_dir="checkpoints")
 tts = IndexTTS(model_dir="checkpoints", cfg_path="checkpoints/config.yaml")
+# --------------------------
+# Translation models
+# --------------------------
+# Maps each direction label (also used as the dropdown choices in the UI) to the
+# MarianMT checkpoint that translates in that direction.
+language_models = {
+    "Spanish → English": "Helsinki-NLP/opus-mt-es-en",
+    "English → Spanish": "Helsinki-NLP/opus-mt-en-es"
+}
+# Only one translation model is held at a time. These three globals describe the
+# active one and are replaced inside translate_with_voice when another pair is chosen.
+current_model_name = language_models["Spanish → English"]
+tokenizer = MarianTokenizer.from_pretrained(current_model_name)
+model = MarianMTModel.from_pretrained(current_model_name)
+# --------------------------
+# Speech-to-text
+# --------------------------
+# Speech-recognition pipeline (openai/whisper-small) used to transcribe the recording.
+asr = pipeline("automatic-speech-recognition", model="openai/whisper-small")
+# --------------------------
+# Helper functions
+# --------------------------
+def text_to_speech(text: str, ref_audio_path):
+    """Synthesize ``text`` in the voice of a reference recording.
+
+    The audio is rendered by the module-level ``tts`` engine into a WAV file,
+    which is then loaded back with ``soundfile``.
+
+    Args:
+        text: Text to speak.
+        ref_audio_path: Path to the reference-voice audio handed to ``tts.infer``.
+
+    Returns:
+        A ``(samplerate, data)`` tuple as read by ``sf.read`` from the rendered file.
+    """
+    import os
+    import tempfile
+
+    # Render into a private temporary directory instead of a fixed "output.wav":
+    # overlapping calls can no longer overwrite each other's result, and no stale
+    # file is left behind in the working directory.
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        output_path = os.path.join(tmp_dir, "output.wav")
+        tts.infer(ref_audio_path, text, output_path)
+        # Read before the directory is removed; the returned data is held in memory.
+        data, samplerate = sf.read(output_path)
+    return samplerate, data
+def translate_with_voice(audio, lang_pair, ref_voice):
+    """Transcribe speech, translate it, and speak the translation in a cloned voice.
+
+    Args:
+        audio: Recording to transcribe, passed straight to the ``asr`` pipeline
+            (the UI supplies a file path).
+        lang_pair: Key of ``language_models`` selecting the translation direction.
+        ref_voice: Path to the reference-voice audio used for synthesis.
+
+    Returns:
+        ``(translated_text, (sample_rate, audio_array))``.
+
+    Raises:
+        gr.Error: If the recording or the reference voice is missing, or if the
+            transcription comes back empty.
+        KeyError: If ``lang_pair`` is not a key of ``language_models``.
+    """
+    global tokenizer, model, current_model_name
+    # Fail early with a readable message instead of an opaque error from deep
+    # inside the ASR or TTS stack when an input was left empty.
+    if audio is None:
+        raise gr.Error("Please record some speech first.")
+    if ref_voice is None:
+        raise gr.Error("Please upload a reference voice sample.")
+    # 1. Speech-to-text
+    text_input = asr(audio)["text"]
+    if not text_input.strip():
+        raise gr.Error("No speech was recognized in the recording.")
+    # 2. Translate
+    requested_model_name = language_models[lang_pair]
+    if requested_model_name != current_model_name:
+        # Load both pieces before touching the globals, so a failed load cannot
+        # leave current_model_name pointing at a model that was never installed
+        # (which would make later requests silently use the wrong direction).
+        new_tokenizer = MarianTokenizer.from_pretrained(requested_model_name)
+        new_model = MarianMTModel.from_pretrained(requested_model_name)
+        tokenizer, model = new_tokenizer, new_model
+        current_model_name = requested_model_name
+    # truncation=True keeps an over-long transcript within the model's input limit.
+    inputs = tokenizer(text_input, return_tensors="pt", padding=True, truncation=True)
+    translated = model.generate(**inputs)
+    translated_text = tokenizer.decode(translated[0], skip_special_tokens=True)
+    # 3. Text-to-speech
+    sr, audio_array = text_to_speech(translated_text, ref_audio_path=ref_voice)
+    return translated_text, (sr, audio_array)
+# --------------------------
+# Gradio UI
+# --------------------------
+# One row with two columns: the first holds the inputs (microphone recording,
+# direction dropdown, reference voice upload, button); the second holds the outputs
+# (translated text and synthesized audio).
+with gr.Blocks() as demo:
+    gr.Markdown("## 🗣 Voice-Cloned Translator (English ↔ Spanish)")
+    with gr.Row():
+        with gr.Column():
+            # Both audio inputs are declared with type="filepath".
+            audio_input = gr.Audio(sources=["microphone"], type="filepath", label="🎙 Speak")
+            # NOTE(review): the label reads "Target Language", but the choices are the
+            # direction pairs from language_models — consider relabelling.
+            lang_dropdown = gr.Dropdown(list(language_models.keys()), label="🌍 Target Language", value="Spanish → English")
+            ref_voice_input = gr.Audio(sources=["upload"], type="filepath", label="🎧 Reference Voice (5–10s)")
+            btn = gr.Button("Translate & Speak")
+        with gr.Column():
+            text_output = gr.Textbox(label="Translated Text")
+            audio_output = gr.Audio(label="🔊 Translated Audio", type="numpy")
+    # Input order matches translate_with_voice(audio, lang_pair, ref_voice); its two
+    # return values go to text_output and audio_output, in that order.
+    btn.click(
+        fn=translate_with_voice,
+        inputs=[audio_input, lang_dropdown, ref_voice_input],
+        outputs=[text_output, audio_output]
+    )
+# Launches the app as a top-level statement (there is no __main__ guard).
+demo.launch()