Spaces:
Build error
Build error
import os
import tempfile

import gradio as gr
import torch
import whisper
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from TTS.api import TTS
# =========================
# ENVIRONMENT
# =========================
# Accept the Coqui model licence up front; without this the XTTS download
# asks for confirmation on stdin, which a headless deployment cannot answer.
os.environ["COQUI_TOS_AGREED"] = "1"

# Single device string shared by every model below and by the pipeline.
device = "cuda" if torch.cuda.is_available() else "cpu"

# =========================
# LOAD MODELS (ONCE)
# =========================
# Everything is loaded at import time so that each request only runs inference.

# Whisper ASR: load straight onto the target device instead of loading and
# then moving the model.
asr_model = whisper.load_model("small", device=device)

# NLLB translation
NLLB_MODEL = "facebook/nllb-200-distilled-600M"
tokenizer = AutoTokenizer.from_pretrained(NLLB_MODEL)
translator = AutoModelForSeq2SeqLM.from_pretrained(NLLB_MODEL).to(device)

# XTTS voice cloning: the `gpu=` keyword is deprecated by Coqui TTS in favour
# of moving the model with `.to(device)`.
tts = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2").to(device)
# =========================
# LANGUAGE MAPS
# =========================
# One row per language offered in the UI:
#   (display name, Whisper code, NLLB code, XTTS code)
# NOTE(review): the XTTS v2 model card lists "hi" but not "mr" or "ta";
# confirm that Marathi and Tamil targets can actually be synthesised.
_LANGUAGE_TABLE = (
    ("English", "en", "eng_Latn", "en"),
    ("Hindi", "hi", "hin_Deva", "hi"),
    ("Marathi", "mr", "mar_Deva", "mr"),
    ("Tamil", "ta", "tam_Taml", "ta"),
)

# Display name -> language code passed to Whisper's transcribe().
WHISPER_LANG = {name: whisper_code for name, whisper_code, _, _ in _LANGUAGE_TABLE}

# Display name -> NLLB language token (used as source and as forced target).
NLLB_LANG = {name: nllb_code for name, _, nllb_code, _ in _LANGUAGE_TABLE}

# Display name -> language code passed to XTTS.
XTTS_LANG = {name: xtts_code for name, _, _, xtts_code in _LANGUAGE_TABLE}
# =========================
# PIPELINE
# =========================
# Language codes XTTS v2 can synthesise, per the Coqui model card. A target
# outside this set still gets its translated text, just without audio.
_XTTS_V2_LANGUAGES = frozenset({
    "en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru",
    "nl", "cs", "ar", "zh-cn", "hu", "ko", "ja", "hi",
})


def speech_translate_clone(audio_path, speaker_wav, src_lang, tgt_lang):
    """Transcribe speech, translate it, and speak the result in a cloned voice.

    Args:
        audio_path: Path of the recorded utterance, or None if nothing was
            recorded.
        speaker_wav: Path of the reference voice sample to clone, or None if
            nothing was uploaded.
        src_lang: Display name of the spoken language; must be a key of
            WHISPER_LANG and NLLB_LANG.
        tgt_lang: Display name of the output language; must be a key of
            NLLB_LANG.

    Returns:
        A ``(text, audio_file)`` tuple. ``text`` is the translated text, or a
        short status message when an input is missing or no speech was
        recognised. ``audio_file`` is the path of the synthesised WAV, or
        None when no audio was produced.

    Raises:
        KeyError: If ``src_lang`` or ``tgt_lang`` is not in the language maps.
    """
    if audio_path is None:
        return "No audio provided", None
    if speaker_wav is None:
        return "Upload a speaker voice sample", None

    # Step 1: speech recognition.
    asr = asr_model.transcribe(audio_path, language=WHISPER_LANG[src_lang])
    source_text = asr.get("text", "").strip()
    if not source_text:
        return "No speech detected", None

    # Step 2: translation. NLLB selects the output language through the
    # first generated token, hence forced_bos_token_id.
    tokenizer.src_lang = NLLB_LANG[src_lang]
    inputs = tokenizer(source_text, return_tensors="pt").to(device)
    with torch.no_grad():
        tokens = translator.generate(
            **inputs,
            forced_bos_token_id=tokenizer.convert_tokens_to_ids(NLLB_LANG[tgt_lang]),
            max_length=256,
        )
    translated_text = tokenizer.decode(tokens[0], skip_special_tokens=True)

    # Step 3: voice cloning. XTTS rejects languages it was not trained on, so
    # hand back the translation alone instead of failing the whole request.
    tts_language = XTTS_LANG.get(tgt_lang)
    if tts_language not in _XTTS_V2_LANGUAGES:
        gr.Warning(
            f"Voice output is not available for {tgt_lang}; "
            "showing the translated text only."
        )
        return translated_text, None

    # Give every request its own output file: a fixed name would let
    # concurrent users overwrite each other's audio. The file is deliberately
    # left on disk so the UI can still read it after this function returns.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
        out_path = handle.name

    tts.tts_to_file(
        text=translated_text,
        speaker_wav=speaker_wav,
        language=tts_language,
        file_path=out_path,
    )
    return translated_text, out_path
# =========================
# UI
# =========================
# Languages offered in both dropdowns.
_LANGUAGE_CHOICES = ("English", "Hindi", "Marathi", "Tamil")

with gr.Blocks(title="Multilingual Voice Translation + Cloning") as demo:
    gr.Markdown("# 🎙 Multilingual Voice Translation + Voice Cloning")
    gr.Markdown("Speech → Translation → Same Voice Output (XTTS v2)")

    # Both audio widgets hand the callback a file path, not raw samples.
    audio_input = gr.Audio(sources=["microphone"], type="filepath", label="Speak")
    speaker_wav = gr.Audio(
        sources=["upload"],
        type="filepath",
        label="Upload Speaker Voice (3–10 sec clear voice)",
    )

    with gr.Row():
        src_lang = gr.Dropdown(
            list(_LANGUAGE_CHOICES), value="English", label="Input Language"
        )
        tgt_lang = gr.Dropdown(
            list(_LANGUAGE_CHOICES), value="Hindi", label="Output Language"
        )

    btn = gr.Button("Translate + Clone Voice")
    text_out = gr.Textbox(label="Translated Text")
    audio_out = gr.Audio(label="Cloned Voice Output")

    # The callback returns (text, audio path), matching the outputs order.
    btn.click(
        fn=speech_translate_clone,
        inputs=[audio_input, speaker_wav, src_lang, tgt_lang],
        outputs=[text_out, audio_out],
    )

# Important for Hugging Face: listen on all interfaces, port 7860.
demo.launch(server_name="0.0.0.0", server_port=7860)