drixo committed on
Commit
0f1723d
·
verified ·
1 Parent(s): 7b5105b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +9 -75
app.py CHANGED
@@ -1,81 +1,15 @@
1
- import gradio as gr
2
- from transformers import MarianMTModel, MarianTokenizer, pipeline
3
- import torch
4
- import numpy as np
5
  from huggingface_hub import snapshot_download
6
- from indextts.infer import IndexTTS
7
-
8
- # --------------------------
9
- # Download Index-TTS from Hugging Face
10
- # --------------------------
11
- snapshot_download("IndexTeam/Index-TTS", local_dir="checkpoints")
12
-
13
- # Initialize TTS
14
- tts = IndexTTS(model_dir="checkpoints", cfg_path="checkpoints/config.yaml")
15
-
16
- # --------------------------
17
- # Translation models
18
- # --------------------------
19
- language_models = {
20
- "Spanish β†’ English": "Helsinki-NLP/opus-mt-es-en",
21
- "English β†’ Spanish": "Helsinki-NLP/opus-mt-en-es"
22
- }
23
- current_model_name = language_models["Spanish β†’ English"]
24
- tokenizer = MarianTokenizer.from_pretrained(current_model_name)
25
- model = MarianMTModel.from_pretrained(current_model_name)
26
-
27
- # --------------------------
28
- # Speech-to-text
29
- # --------------------------
30
- asr = pipeline("automatic-speech-recognition", model="openai/whisper-small")
31
 
32
- # --------------------------
33
- # Helpers
34
- # --------------------------
35
- def text_to_speech(text: str, ref_audio_path):
36
- output_path = "output.wav"
37
- tts.infer(ref_audio_path, text, output_path)
38
- # Load waveform for Gradio
39
- import soundfile as sf
40
- data, samplerate = sf.read(output_path)
41
- return samplerate, data
42
 
43
- def translate_with_voice(audio, lang_pair, ref_voice):
44
- text_input = asr(audio)["text"]
45
-
46
- global tokenizer, model, current_model_name
47
- if language_models[lang_pair] != current_model_name:
48
- current_model_name = language_models[lang_pair]
49
- tokenizer = MarianTokenizer.from_pretrained(current_model_name)
50
- model = MarianMTModel.from_pretrained(current_model_name)
51
-
52
- inputs = tokenizer(text_input, return_tensors="pt", padding=True)
53
- translated = model.generate(**inputs)
54
- translated_text = tokenizer.decode(translated[0], skip_special_tokens=True)
55
-
56
- sr, audio_array = text_to_speech(translated_text, ref_audio_path=ref_voice)
57
- return translated_text, (sr, audio_array)
58
-
59
- # --------------------------
60
- # Gradio UI
61
- # --------------------------
62
- with gr.Blocks() as demo:
63
- gr.Markdown("## 🗣 Voice-Cloned Translator (English ↔ Spanish)")
64
- with gr.Row():
65
- with gr.Column():
66
- audio_input = gr.Audio(sources=["microphone"], type="filepath", label="🎙 Speak")
67
- lang_dropdown = gr.Dropdown(list(language_models.keys()), label="🌍 Target Language", value="Spanish β†’ English")
68
- ref_voice_input = gr.Audio(sources=["upload"], type="filepath", label="🎧 Reference Voice (5–10s)")
69
- btn = gr.Button("Translate & Speak")
70
- with gr.Column():
71
- text_output = gr.Textbox(label="Translated Text")
72
- audio_output = gr.Audio(label="🔊 Translated Audio", type="numpy")
73
 
74
- btn.click(
75
- fn=translate_with_voice,
76
- inputs=[audio_input, lang_dropdown, ref_voice_input],
77
- outputs=[text_output, audio_output]
78
- )
79
 
80
- demo.launch()
 
 
81
 
 
 
1
+ # Download model
 
 
 
2
  from huggingface_hub import snapshot_download
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
+ snapshot_download("IndexTeam/Index-TTS", local_dir="checkpoints")  # repo_id must be a string; unquoted it parses as IndexTeam / Index - TTS and raises NameError
 
 
 
 
 
 
 
 
 
5
 
6
+ from indextts.infer import IndexTTS
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
+ # Ensure config.yaml is present in the checkpoints directory
9
+ tts = IndexTTS(model_dir="checkpoints", cfg_path="checkpoints/config.yaml")
 
 
 
10
 
11
+ voice = "path/to/your/reference_voice.wav" # Path to the voice reference audio file
12
+ text = "Hello, how are you?"
13
+ output_path = "output_index.wav"
14
 
15
+ tts.infer(voice, text, output_path)