Spaces:

drixo
/

Translator

Sleeping

App Files Files Community

drixo committed on Sep 3, 2025

Commit

8aeefb3

verified ·

1 Parent(s): b28ff8a

Update app.py

Browse files

Files changed (1) hide show

app.py +8 -20

app.py CHANGED Viewed

@@ -3,64 +3,52 @@ from transformers import MarianMTModel, MarianTokenizer, pipeline
 import torch
 import numpy as np
 from huggingface_hub import hf_hub_download
-from indextts.infer import IndexTTS
 # --------------------------
-# Translation Model Setup
 # --------------------------
 language_models = {
     "Spanish → English": "Helsinki-NLP/opus-mt-es-en",
     "English → Spanish": "Helsinki-NLP/opus-mt-en-es"
 }
 current_model_name = language_models["Spanish → English"]
 tokenizer = MarianTokenizer.from_pretrained(current_model_name)
 model = MarianMTModel.from_pretrained(current_model_name)
 # --------------------------
-# ASR (Speech-to-Text)
 # --------------------------
 asr = pipeline("automatic-speech-recognition", model="openai/whisper-small")
 # --------------------------
-# IndexTTS Setup (download from Hugging Face Hub)
 # --------------------------
-ckpt_path = hf_hub_download(
-    repo_id="IndexTeam/Index-TTS",
-    filename="checkpoints/index_tts_small.ckpt"
-)
-cfg_path = hf_hub_download(
-    repo_id="IndexTeam/Index-TTS",
-    filename="configs/config.yaml"
-)
 tts = IndexTTS(model_dir=ckpt_path, cfg_path=cfg_path)
 # --------------------------
-# Helper Functions
 # --------------------------
 def text_to_speech(text: str, ref_audio_path):
-    """Convert translated text to speech using reference voice"""
     waveform = tts.generate(text, ref_audio=ref_audio_path)
     audio_np = waveform.cpu().numpy() if torch.is_tensor(waveform) else np.array(waveform, dtype=np.float32)
     return 16000, audio_np
 def translate_with_voice(audio, lang_pair, ref_voice):
-    """Full pipeline: STT → Translate → TTS with cloned voice"""
-    # 1️⃣ Speech-to-text
     text_input = asr(audio)["text"]
-    # 2️⃣ Translate
     global tokenizer, model, current_model_name
     if language_models[lang_pair] != current_model_name:
         current_model_name = language_models[lang_pair]
         tokenizer = MarianTokenizer.from_pretrained(current_model_name)
         model = MarianMTModel.from_pretrained(current_model_name)
     inputs = tokenizer(text_input, return_tensors="pt", padding=True)
     translated = model.generate(**inputs)
     translated_text = tokenizer.decode(translated[0], skip_special_tokens=True)
-    # 3️⃣ Convert to speech
     sr, audio_array = text_to_speech(translated_text, ref_audio_path=ref_voice)
     return translated_text, (sr, audio_array)

 import torch
 import numpy as np
 from huggingface_hub import hf_hub_download
+from Index-TTS.infer import IndexTTS  # import from local clone
 # --------------------------
+# Translation models
 # --------------------------
 # Maps a language-pair label to the Marian checkpoint name used for it.
 language_models = {
     "Spanish → English": "Helsinki-NLP/opus-mt-es-en",
     "English → Spanish": "Helsinki-NLP/opus-mt-en-es"
 }
 # Spanish → English is loaded up front; translate_with_voice rebinds these
 # three globals when it is called with a different language pair.
 current_model_name = language_models["Spanish → English"]
 tokenizer = MarianTokenizer.from_pretrained(current_model_name)
 model = MarianMTModel.from_pretrained(current_model_name)
 # --------------------------
+# ASR
 # --------------------------
 asr = pipeline("automatic-speech-recognition", model="openai/whisper-small")
 # --------------------------
+# IndexTTS setup
 # --------------------------
+ckpt_path = hf_hub_download("IndexTeam/Index-TTS", "checkpoints/index_tts_small.ckpt")
+cfg_path = hf_hub_download("IndexTeam/Index-TTS", "configs/config.yaml")
 tts = IndexTTS(model_dir=ckpt_path, cfg_path=cfg_path)
 # --------------------------
+# Helpers
 # --------------------------
 def text_to_speech(text: str, ref_audio_path):
     """Synthesize ``text`` with the module-level ``tts`` engine.
     Args:
         text: Text to speak.
         ref_audio_path: Reference audio, forwarded to ``tts.generate`` as
             its ``ref_audio`` argument.
     Returns:
         A ``(sample_rate, samples)`` tuple. ``sample_rate`` is always 16000
         and ``samples`` is a NumPy array.
     """
     waveform = tts.generate(text, ref_audio=ref_audio_path)
     if torch.is_tensor(waveform):
         # detach() first: Tensor.numpy() raises RuntimeError on a tensor
         # that still requires grad.
         audio_np = waveform.detach().cpu().numpy()
     else:
         audio_np = np.array(waveform, dtype=np.float32)
     # NOTE(review): the 16000 Hz rate is hard-coded here; confirm it matches
     # the sample rate the TTS model actually produces.
     return 16000, audio_np
 def translate_with_voice(audio, lang_pair, ref_voice):
     """Transcribe ``audio``, translate the text, and speak the translation.
     Args:
         audio: Input passed to the module-level ``asr`` pipeline.
         lang_pair: Key into ``language_models`` selecting the translation
             checkpoint.
         ref_voice: Reference audio forwarded to ``text_to_speech``.
     Returns:
         ``(translated_text, (sample_rate, samples))``.
     Raises:
         KeyError: If ``lang_pair`` is not a key of ``language_models``.
     """
     global tokenizer, model, current_model_name
     text_input = asr(audio)["text"]
     wanted_model_name = language_models[lang_pair]
     if wanted_model_name != current_model_name:
         # Load both pieces before rebinding any global, so a failed download
         # cannot leave the name, tokenizer and model out of sync.
         new_tokenizer = MarianTokenizer.from_pretrained(wanted_model_name)
         new_model = MarianMTModel.from_pretrained(wanted_model_name)
         tokenizer = new_tokenizer
         model = new_model
         current_model_name = wanted_model_name
     # truncation=True keeps a long transcript within the model's maximum
     # input length instead of failing inside generate().
     inputs = tokenizer(
         text_input, return_tensors="pt", padding=True, truncation=True
     )
     translated = model.generate(**inputs)
     translated_text = tokenizer.decode(translated[0], skip_special_tokens=True)
     sr, audio_array = text_to_speech(translated_text, ref_audio_path=ref_voice)
     return translated_text, (sr, audio_array)