drixo committed on
Commit
8aeefb3
·
verified ·
1 Parent(s): b28ff8a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -20
app.py CHANGED
@@ -3,64 +3,52 @@ from transformers import MarianMTModel, MarianTokenizer, pipeline
3
  import torch
4
  import numpy as np
5
  from huggingface_hub import hf_hub_download
6
- from indextts.infer import IndexTTS
7
 
8
  # --------------------------
9
- # Translation Model Setup
10
  # --------------------------
11
  language_models = {
12
  "Spanish → English": "Helsinki-NLP/opus-mt-es-en",
13
  "English → Spanish": "Helsinki-NLP/opus-mt-en-es"
14
  }
15
-
16
  current_model_name = language_models["Spanish → English"]
17
  tokenizer = MarianTokenizer.from_pretrained(current_model_name)
18
  model = MarianMTModel.from_pretrained(current_model_name)
19
 
20
  # --------------------------
21
- # ASR (Speech-to-Text)
22
  # --------------------------
23
  asr = pipeline("automatic-speech-recognition", model="openai/whisper-small")
24
 
25
  # --------------------------
26
- # IndexTTS Setup (download from Hugging Face Hub)
27
  # --------------------------
28
- ckpt_path = hf_hub_download(
29
- repo_id="IndexTeam/Index-TTS",
30
- filename="checkpoints/index_tts_small.ckpt"
31
- )
32
- cfg_path = hf_hub_download(
33
- repo_id="IndexTeam/Index-TTS",
34
- filename="configs/config.yaml"
35
- )
36
-
37
  tts = IndexTTS(model_dir=ckpt_path, cfg_path=cfg_path)
38
 
39
  # --------------------------
40
- # Helper Functions
41
  # --------------------------
42
  def text_to_speech(text: str, ref_audio_path):
43
- """Convert translated text to speech using reference voice"""
44
  waveform = tts.generate(text, ref_audio=ref_audio_path)
45
  audio_np = waveform.cpu().numpy() if torch.is_tensor(waveform) else np.array(waveform, dtype=np.float32)
46
  return 16000, audio_np
47
 
48
  def translate_with_voice(audio, lang_pair, ref_voice):
49
- """Full pipeline: STT → Translate → TTS with cloned voice"""
50
- # 1️⃣ Speech-to-text
51
  text_input = asr(audio)["text"]
52
 
53
- # 2️⃣ Translate
54
  global tokenizer, model, current_model_name
55
  if language_models[lang_pair] != current_model_name:
56
  current_model_name = language_models[lang_pair]
57
  tokenizer = MarianTokenizer.from_pretrained(current_model_name)
58
  model = MarianMTModel.from_pretrained(current_model_name)
 
59
  inputs = tokenizer(text_input, return_tensors="pt", padding=True)
60
  translated = model.generate(**inputs)
61
  translated_text = tokenizer.decode(translated[0], skip_special_tokens=True)
62
 
63
- # 3️⃣ Convert to speech
64
  sr, audio_array = text_to_speech(translated_text, ref_audio_path=ref_voice)
65
  return translated_text, (sr, audio_array)
66
 
 
3
  import torch
4
  import numpy as np
5
  from huggingface_hub import hf_hub_download
6
+ from Index-TTS.infer import IndexTTS # import from local clone
7
 
8
  # --------------------------
9
+ # Translation models
10
  # --------------------------
11
  language_models = {
12
  "Spanish → English": "Helsinki-NLP/opus-mt-es-en",
13
  "English → Spanish": "Helsinki-NLP/opus-mt-en-es"
14
  }
 
15
  current_model_name = language_models["Spanish → English"]
16
  tokenizer = MarianTokenizer.from_pretrained(current_model_name)
17
  model = MarianMTModel.from_pretrained(current_model_name)
18
 
19
  # --------------------------
20
+ # ASR
21
  # --------------------------
22
  asr = pipeline("automatic-speech-recognition", model="openai/whisper-small")
23
 
24
  # --------------------------
25
+ # IndexTTS setup
26
  # --------------------------
27
+ ckpt_path = hf_hub_download("IndexTeam/Index-TTS", "checkpoints/index_tts_small.ckpt")
28
+ cfg_path = hf_hub_download("IndexTeam/Index-TTS", "configs/config.yaml")
 
 
 
 
 
 
 
29
  tts = IndexTTS(model_dir=ckpt_path, cfg_path=cfg_path)
30
 
31
  # --------------------------
32
+ # Helpers
33
  # --------------------------
34
  def text_to_speech(text: str, ref_audio_path):
 
35
  waveform = tts.generate(text, ref_audio=ref_audio_path)
36
  audio_np = waveform.cpu().numpy() if torch.is_tensor(waveform) else np.array(waveform, dtype=np.float32)
37
  return 16000, audio_np
38
 
39
  def translate_with_voice(audio, lang_pair, ref_voice):
 
 
40
  text_input = asr(audio)["text"]
41
 
 
42
  global tokenizer, model, current_model_name
43
  if language_models[lang_pair] != current_model_name:
44
  current_model_name = language_models[lang_pair]
45
  tokenizer = MarianTokenizer.from_pretrained(current_model_name)
46
  model = MarianMTModel.from_pretrained(current_model_name)
47
+
48
  inputs = tokenizer(text_input, return_tensors="pt", padding=True)
49
  translated = model.generate(**inputs)
50
  translated_text = tokenizer.decode(translated[0], skip_special_tokens=True)
51
 
 
52
  sr, audio_array = text_to_speech(translated_text, ref_audio_path=ref_voice)
53
  return translated_text, (sr, audio_array)
54