# app.py — Multilingual Voice Translation + Voice Cloning (Hugging Face Space)
# Space by GaneshSarode — "Update app.py" (commit b9efa28, verified)
import os
import tempfile

import gradio as gr
import torch
import whisper
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from TTS.api import TTS
# =========================
# ENV FIXES (VERY IMPORTANT)
# =========================
# Pre-accept Coqui's XTTS license prompt; without this the TTS() call below
# would block waiting for interactive yes/no input on a headless Space.
# NOTE: must be set before the TTS(...) construction further down.
os.environ["COQUI_TOS_AGREED"] = "1"
# Use the GPU when one is visible, otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# =========================
# LOAD MODELS (ONCE)
# =========================
# All three models are loaded once at import time and shared as module
# globals by speech_translate_clone().
# Whisper ASR
# Multilingual speech-to-text ("small" checkpoint).
asr_model = whisper.load_model("small").to(device)
# NLLB Translation
# Text-to-text translation (NLLB-200, distilled 600M parameters).
NLLB_MODEL = "facebook/nllb-200-distilled-600M"
tokenizer = AutoTokenizer.from_pretrained(NLLB_MODEL)
translator = AutoModelForSeq2SeqLM.from_pretrained(NLLB_MODEL).to(device)
# XTTS Voice Cloning
# Multilingual voice-cloning TTS; Coqui's API takes a gpu= flag instead of
# the .to(device) pattern used by the other two models.
tts = TTS(
    model_name="tts_models/multilingual/multi-dataset/xtts_v2",
    gpu=torch.cuda.is_available()
)
# =========================
# LANGUAGE MAPS
# =========================
# UI display names of the supported languages (keys of every map below).
_LANG_NAMES = ("English", "Hindi", "Marathi", "Tamil")
# Whisper and XTTS both take ISO 639-1 two-letter codes.
_ISO_CODES = ("en", "hi", "mr", "ta")
# NLLB takes FLORES-200 codes (language + script).
_FLORES_CODES = ("eng_Latn", "hin_Deva", "mar_Deva", "tam_Taml")

WHISPER_LANG = dict(zip(_LANG_NAMES, _ISO_CODES))
NLLB_LANG = dict(zip(_LANG_NAMES, _FLORES_CODES))
XTTS_LANG = dict(zip(_LANG_NAMES, _ISO_CODES))
# =========================
# PIPELINE
# =========================
def speech_translate_clone(audio_path, speaker_wav, src_lang, tgt_lang):
    """Transcribe speech, translate it, and re-speak it in the user's voice.

    Pipeline: Whisper ASR -> NLLB translation -> XTTS v2 voice cloning.
    Uses the module-level models (asr_model, tokenizer, translator, tts).

    Args:
        audio_path: Filepath of the recorded input audio, or None.
        speaker_wav: Filepath of a reference voice sample for cloning, or None.
        src_lang: Display name of the input language (a key of the *_LANG maps).
        tgt_lang: Display name of the output language.

    Returns:
        (translated_text, output_wav_path) on success, or
        (error_message, None) when input is missing or no speech is detected.
    """
    if audio_path is None:
        return "No audio provided", None
    if speaker_wav is None:
        return "Upload a speaker voice sample", None

    # 1) ASR — transcribe in the declared source language.
    asr = asr_model.transcribe(
        audio_path,
        language=WHISPER_LANG[src_lang]
    )
    source_text = asr.get("text", "").strip()
    if not source_text:
        return "No speech detected", None

    # 2) Translation — NLLB needs the source language set on the tokenizer
    # and the target language forced as the first generated token.
    if src_lang == tgt_lang:
        # Same language selected on both sides: nothing to translate.
        translated_text = source_text
    else:
        tokenizer.src_lang = NLLB_LANG[src_lang]
        inputs = tokenizer(source_text, return_tensors="pt").to(device)
        with torch.no_grad():
            tokens = translator.generate(
                **inputs,
                forced_bos_token_id=tokenizer.convert_tokens_to_ids(
                    NLLB_LANG[tgt_lang]
                ),
                max_length=256
            )
        translated_text = tokenizer.decode(tokens[0], skip_special_tokens=True)

    # 3) XTTS voice cloning.
    # Write to a unique temp file rather than a fixed "output.wav": the fixed
    # path was a race — two concurrent Gradio requests would overwrite each
    # other's result before it was served.
    fd, out_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)  # XTTS opens the path itself; we only need the name.
    tts.tts_to_file(
        text=translated_text,
        speaker_wav=speaker_wav,
        language=XTTS_LANG[tgt_lang],
        file_path=out_path
    )
    return translated_text, out_path
# =========================
# UI
# =========================
# Gradio front-end: record audio, upload a reference voice, pick languages,
# then run the ASR -> translate -> clone pipeline on button click.
with gr.Blocks(title="Multilingual Voice Translation + Cloning") as demo:
    gr.Markdown("# 🎙 Multilingual Voice Translation + Voice Cloning")
    gr.Markdown("Speech → Translation → Same Voice Output (XTTS v2)")

    mic = gr.Audio(sources=["microphone"], type="filepath", label="Speak")
    reference_voice = gr.Audio(
        sources=["upload"],
        type="filepath",
        label="Upload Speaker Voice (3–10 sec clear voice)",
    )

    language_choices = ["English", "Hindi", "Marathi", "Tamil"]
    with gr.Row():
        source_dropdown = gr.Dropdown(
            language_choices,
            value="English",
            label="Input Language",
        )
        target_dropdown = gr.Dropdown(
            language_choices,
            value="Hindi",
            label="Output Language",
        )

    run_button = gr.Button("Translate + Clone Voice")
    translated_box = gr.Textbox(label="Translated Text")
    cloned_audio = gr.Audio(label="Cloned Voice Output")

    run_button.click(
        fn=speech_translate_clone,
        inputs=[mic, reference_voice, source_dropdown, target_dropdown],
        outputs=[translated_box, cloned_audio],
    )

# IMPORTANT FOR HUGGING FACE
demo.launch(server_name="0.0.0.0", server_port=7860)