Spaces:
Sleeping
Sleeping
| import torch | |
| import gradio as gr | |
| import whisper | |
| from gtts import gTTS | |
| from pydub import AudioSegment | |
| import tempfile | |
| import os | |
| from transformers import MBartForConditionalGeneration, MBart50Tokenizer | |
# Speech-to-text: the "base" Whisper checkpoint.
whisper_model = whisper.load_model("base")

# Text-to-text: the mBART-50 many-to-many checkpoint and its tokenizer.
model_name = "facebook/mbart-large-50-many-to-many-mmt"
tokenizer = MBart50Tokenizer.from_pretrained(model_name)
model = MBartForConditionalGeneration.from_pretrained(model_name)

# Run on the GPU when one is available, otherwise stay on the CPU.
_device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(_device)

# mBART language code used to force the language of the generated text.
TARGET_LANG = "hi_IN"  # Hindi
def _transcribe_audio(audio_file):
    """Convert *audio_file* to WAV and return Whisper's transcription text.

    The intermediate WAV file is removed once transcription has finished
    (or failed), so repeated requests do not accumulate temp files.
    """
    sound = AudioSegment.from_file(audio_file)
    # Only reserve a unique path here; the handle is closed before pydub
    # writes to it so the export also works where an open temporary file
    # cannot be reopened by name.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmpwav:
        wav_path = tmpwav.name
    try:
        sound.export(wav_path, format="wav")
        return whisper_model.transcribe(wav_path)["text"]
    finally:
        # Best-effort cleanup: a failure to delete must not mask the result
        # (or the original exception) of the transcription itself.
        try:
            os.remove(wav_path)
        except OSError:
            pass


def respond(prompt_text, audio_file):
    """Translate a typed or spoken prompt and synthesize it as speech.

    Typed text takes precedence; the audio file is only transcribed when
    ``prompt_text`` is empty or whitespace.

    Args:
        prompt_text: Text typed by the user (may be ``None`` or empty).
        audio_file: Path to a recorded/uploaded audio file (may be ``None``).

    Returns:
        A 3-tuple ``(status, translated_text, audio_path)``:
        ``status`` is the transcription when audio was used, the string
        ``"Typed input used"`` for typed prompts, or a message describing
        why nothing was produced; ``translated_text`` is the model output
        (``""`` on failure); ``audio_path`` is the path of the generated MP3
        (``None`` on failure).
    """
    transcription = None
    try:
        if prompt_text and prompt_text.strip():
            final_prompt = prompt_text.strip()
        elif audio_file:
            transcription = _transcribe_audio(audio_file).strip()
            if not transcription:
                # Nothing intelligible was recognised; do not translate an
                # empty prompt or mislabel the request as typed input.
                return "No speech detected in the audio", "", None
            final_prompt = transcription
        else:
            return "No prompt provided", "", None

        # Generate response: the prompt is treated as English and the
        # decoder is forced to start in TARGET_LANG.
        tokenizer.src_lang = "en_XX"
        # truncation=True keeps over-long prompts within the model's
        # maximum input length instead of failing inside the model.
        encoded = tokenizer(
            final_prompt,
            return_tensors="pt",
            truncation=True,
        ).to(model.device)
        generated_tokens = model.generate(
            **encoded,
            forced_bos_token_id=tokenizer.lang_code_to_id[TARGET_LANG],
            max_new_tokens=100,
        )
        translated = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)

        # TTS: the MP3 is intentionally left on disk because its path is
        # handed back to the caller as the audio result.
        tts = gTTS(translated, lang='hi')
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as fp:
            audio_path = fp.name
        # Save after the handle is closed so gTTS can open the path itself.
        tts.save(audio_path)

        return transcription or "Typed input used", translated, audio_path
    except Exception as e:
        # UI boundary: report any failure in the first output instead of
        # letting the exception propagate to the caller.
        return f"Error: {str(e)}", "", None
# Header text rendered at the top of the page.
_INTRO_MARKDOWN = """
# 🧠 Chat with Vidhya
**An AI assistant that listens to your voice or reads your text, and responds in your language.**
"""

# Build the page: one row of inputs, one row of outputs, and a submit button
# that routes both inputs through `respond`.
with gr.Blocks(theme=gr.themes.Soft(), title="Chat with Vidhya") as demo:
    gr.Markdown(_INTRO_MARKDOWN)

    # Inputs: a typed prompt or a spoken one.
    with gr.Row():
        prompt_box = gr.Textbox(lines=2, label="Type your prompt (optional)")
        speech_in = gr.Audio(type="filepath", label="Or speak your prompt")

    # Outputs: one component per element of the tuple returned by `respond`.
    with gr.Row():
        transcript_box = gr.Textbox(label="Transcribed Speech")
        reply_box = gr.Textbox(label="Model's Response")
        speech_out = gr.Audio(type="filepath", label="Spoken Response")

    send_button = gr.Button("Submit")
    send_button.click(
        fn=respond,
        inputs=[prompt_box, speech_in],
        outputs=[transcript_box, reply_box, speech_out],
    )

demo.launch()