drixo committed
Commit 25f2399 · verified · 1 Parent(s): 5049836

Update app.py

Files changed (1)
  1. app.py +74 -9
app.py CHANGED
@@ -1,15 +1,80 @@
- # Download model
  from huggingface_hub import snapshot_download
-
- snapshot_download(IndexTeam/Index-TTS, local_dir="checkpoints")
-
  from indextts.infer import IndexTTS

- # Ensure config.yaml is present in the checkpoints directory
  tts = IndexTTS(model_dir="checkpoints", cfg_path="checkpoints/config.yaml")

- voice = "path/to/your/reference_voice.wav"  # Path to the voice reference audio file
- text = "Hello, how are you?"
- output_path = "output_index.wav"

- tts.infer(voice, text, output_path)
+ import gradio as gr
+ from transformers import MarianMTModel, MarianTokenizer, pipeline
+ import torch
+ import numpy as np
  from huggingface_hub import snapshot_download
  from indextts.infer import IndexTTS
+ import soundfile as sf
+
+ # --------------------------
+ # Download Index-TTS model from Hugging Face
+ # --------------------------
+ snapshot_download("IndexTeam/Index-TTS", local_dir="checkpoints")
  tts = IndexTTS(model_dir="checkpoints", cfg_path="checkpoints/config.yaml")
+
+ # --------------------------
+ # Translation models
+ # --------------------------
+ language_models = {
+     "Spanish → English": "Helsinki-NLP/opus-mt-es-en",
+     "English → Spanish": "Helsinki-NLP/opus-mt-en-es"
+ }
+ current_model_name = language_models["Spanish → English"]
+ tokenizer = MarianTokenizer.from_pretrained(current_model_name)
+ model = MarianMTModel.from_pretrained(current_model_name)
+
+ # --------------------------
+ # Speech-to-text
+ # --------------------------
+ asr = pipeline("automatic-speech-recognition", model="openai/whisper-small")
+
+ # --------------------------
+ # Helper functions
+ # --------------------------
+ def text_to_speech(text: str, ref_audio_path):
+     output_path = "output.wav"
+     tts.infer(ref_audio_path, text, output_path)
+     data, samplerate = sf.read(output_path)
+     return samplerate, data
+
+ def translate_with_voice(audio, lang_pair, ref_voice):
+     # 1️⃣ Speech-to-text
+     text_input = asr(audio)["text"]
+
+     # 2️⃣ Translate
+     global tokenizer, model, current_model_name
+     if language_models[lang_pair] != current_model_name:
+         current_model_name = language_models[lang_pair]
+         tokenizer = MarianTokenizer.from_pretrained(current_model_name)
+         model = MarianMTModel.from_pretrained(current_model_name)
+
+     inputs = tokenizer(text_input, return_tensors="pt", padding=True)
+     translated = model.generate(**inputs)
+     translated_text = tokenizer.decode(translated[0], skip_special_tokens=True)
+
+     # 3️⃣ Text-to-speech
+     sr, audio_array = text_to_speech(translated_text, ref_audio_path=ref_voice)
+     return translated_text, (sr, audio_array)
+
+ # --------------------------
+ # Gradio UI
+ # --------------------------
+ with gr.Blocks() as demo:
+     gr.Markdown("## 🗣 Voice-Cloned Translator (English ↔ Spanish)")
+     with gr.Row():
+         with gr.Column():
+             audio_input = gr.Audio(sources=["microphone"], type="filepath", label="🎙 Speak")
+             lang_dropdown = gr.Dropdown(list(language_models.keys()), label="🌍 Target Language", value="Spanish → English")
+             ref_voice_input = gr.Audio(sources=["upload"], type="filepath", label="🎧 Reference Voice (5–10s)")
+             btn = gr.Button("Translate & Speak")
+         with gr.Column():
+             text_output = gr.Textbox(label="Translated Text")
+             audio_output = gr.Audio(label="🔊 Translated Audio", type="numpy")
+
+     btn.click(
+         fn=translate_with_voice,
+         inputs=[audio_input, lang_dropdown, ref_voice_input],
+         outputs=[text_output, audio_output]
+     )
+
+ demo.launch()
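
For a quick smoke test of the voice-cloning path outside the Gradio UI, the Index-TTS call that text_to_speech() wraps can be run on its own. This is a minimal sketch, assuming the checkpoints directory has already been populated by snapshot_download() as in the new app.py; "ref.wav" and "hello_es.wav" are placeholder file names, not files shipped with the Space:

# Standalone sketch of the tts.infer() call used by text_to_speech() above.
# "ref.wav" is a hypothetical 5-10 s reference recording supplied by the user;
# "hello_es.wav" is an arbitrary output path.
from indextts.infer import IndexTTS

tts = IndexTTS(model_dir="checkpoints", cfg_path="checkpoints/config.yaml")
tts.infer("ref.wav", "Hola, ¿cómo estás?", "hello_es.wav")  # (reference voice, text, output path)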