"""Multilingual voice chat demo.

Pipeline: speech-to-text (Whisper) -> translate to English (MarianMT) ->
generate a reply (GPT-Neo) -> translate back -> text-to-speech (Coqui TTS),
served through a Gradio audio-in / audio-out interface.
"""

import os
import tempfile

import gradio as gr
import torch
import whisper
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    MarianMTModel,
    MarianTokenizer,
)
from TTS.api import TTS

# -----------------------------
# Model Loading Section
# -----------------------------

# "tiny" keeps memory/CPU usage low enough for free-tier hosting.
print("Loading Whisper model...")
stt_model = whisper.load_model("tiny")
print("Whisper model loaded.")

# Cache of MarianMT (tokenizer, model) pairs keyed by "src-tgt" language pair.
translation_models = {}


def get_translation_model(src_lang, tgt_lang):
    """Load and cache the Helsinki-NLP MarianMT model for a language pair.

    Args:
        src_lang: ISO 639-1 source language code (e.g. "fr").
        tgt_lang: ISO 639-1 target language code (e.g. "en").

    Returns:
        (tokenizer, model) on success, or (None, None) when no pretrained
        checkpoint exists for the pair — callers fall back to the original text.
    """
    key = f"{src_lang}-{tgt_lang}"
    if key in translation_models:
        return translation_models[key]
    model_name = f"Helsinki-NLP/opus-mt-{src_lang}-{tgt_lang}"
    try:
        tokenizer = MarianTokenizer.from_pretrained(model_name)
        model = MarianMTModel.from_pretrained(model_name)
    except Exception as e:
        # Not every language pair has a published opus-mt checkpoint.
        print(f"Translation model {model_name} not found. Error: {e}")
        return None, None
    translation_models[key] = (tokenizer, model)
    print(f"Loaded translation model: {model_name}")
    return tokenizer, model


# Load Language Model (GPT-Neo 125M: small enough for CPU-only free tier).
print("Loading Language Model...")
lm_model_name = "EleutherAI/gpt-neo-125M"
lm_tokenizer = AutoTokenizer.from_pretrained(lm_model_name)
lm_model = AutoModelForCausalLM.from_pretrained(lm_model_name)
print("Language model loaded.")

# English-only TTS model; see text_to_speech for the non-English fallback.
print("Loading TTS model...")
tts_model = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False, gpu=False)
print("TTS model loaded.")


# -----------------------------
# Function Definitions
# -----------------------------

def speech_to_text(audio_path):
    """Transcribe an audio file and detect its spoken language with Whisper.

    Args:
        audio_path: Path to an audio file readable by ffmpeg.

    Returns:
        (text, detected_lang): transcript and Whisper's ISO language code.
    """
    result = stt_model.transcribe(audio_path)
    text = result["text"]
    detected_lang = result["language"]
    print(f"Transcribed Text: {text}")
    print(f"Detected Language: {detected_lang}")
    return text, detected_lang


def translate_text(text, src_lang, tgt_lang='en'):
    """Translate text from src_lang to tgt_lang using MarianMT.

    Returns the input unchanged when the languages match or when no
    translation model is available for the pair (best-effort behavior).
    """
    if src_lang == tgt_lang:
        print("No translation needed.")
        return text
    tokenizer, model = get_translation_model(src_lang, tgt_lang)
    if tokenizer is None or model is None:
        print(f"No translation model found for {src_lang} to {tgt_lang}. Returning original text.")
        return text  # Return original text if translation model not found
    inputs = tokenizer(text, return_tensors="pt", padding=True)
    with torch.no_grad():  # inference only; skip autograd bookkeeping
        translated = model.generate(**inputs)
    translated_text = tokenizer.decode(translated[0], skip_special_tokens=True)
    print(f"Translated Text ({src_lang} -> {tgt_lang}): {translated_text}")
    return translated_text


def generate_response(prompt):
    """Generate a conversational reply to *prompt* with GPT-Neo.

    Fixes over the original:
    - max_new_tokens instead of max_length, so a long prompt cannot consume
      the entire generation budget (max_length counts prompt tokens too).
    - The prompt is stripped by slicing generated token ids rather than by
      ``response[len(prompt):]``, which silently corrupts the reply whenever
      decoding does not round-trip the prompt byte-for-byte.
    - attention_mask and pad_token_id passed explicitly to avoid padding
      ambiguity warnings (GPT-Neo has no pad token of its own).
    """
    inputs = lm_tokenizer(prompt, return_tensors="pt")
    with torch.no_grad():
        outputs = lm_model.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_new_tokens=100,
            do_sample=True,
            temperature=0.7,
            pad_token_id=lm_tokenizer.eos_token_id,
        )
    new_tokens = outputs[0][inputs.input_ids.shape[1]:]
    response = lm_tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
    print(f"AI Response: {response}")
    return response


def text_to_speech(text, lang='en'):
    """Convert text to a WAV file using Coqui TTS; return the file path.

    Only English synthesis is available; other languages fall back to the
    English voice. The temp file is closed before Coqui writes to it, since
    writing into a still-open NamedTemporaryFile fails on Windows.
    """
    if lang != 'en':  # Extend with multilingual TTS models as needed
        print(f"TTS for language '{lang}' not implemented. Using English TTS.")
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
        out_path = tmp.name
    tts_model.tts_to_file(text=text, file_path=out_path)
    print(f"Generated TTS audio at: {out_path}")
    return out_path


def process_audio(audio):
    """Full pipeline: STT -> translate -> generate -> translate back -> TTS.

    Args:
        audio: Filesystem path to the uploaded audio file (the Gradio input
            component uses type="filepath", so this is a plain str path).

    Returns:
        Path to the synthesized reply WAV, or None on error / oversized input.
    """
    if audio is None:
        print("No audio received.")
        return None
    # Limit uploads to 10MB to keep processing fast on small hosts.
    if os.path.getsize(audio) > 10 * 1024 * 1024:
        print("Uploaded audio file is too large.")
        return None
    try:
        # Step 1: Speech-to-Text
        user_text, detected_lang = speech_to_text(audio)
        # Step 2: Translate to English
        translated_text = translate_text(user_text, src_lang=detected_lang, tgt_lang='en')
        # Step 3: Generate Response
        ai_response = generate_response(translated_text)
        # Step 4: Translate Back to User's Language
        translated_response = translate_text(ai_response, src_lang='en', tgt_lang=detected_lang)
        # Step 5: Text-to-Speech. Return the path itself: Gradio copies the
        # file into its cache, so it must still exist when we return (the
        # original code deleted it and returned raw bytes, which a gr.Audio
        # output cannot serve).
        return text_to_speech(translated_response, lang=detected_lang)
    except Exception as e:
        print(f"Error during processing: {e}")
        return None


# -----------------------------
# Gradio Interface Definition
# -----------------------------
iface = gr.Interface(
    fn=process_audio,
    # type="filepath" hands process_audio a plain path string; the old
    # type="file" passed a tempfile wrapper whose .size/.read() usage broke
    # the pipeline, and "file" is deprecated in current Gradio releases.
    inputs=gr.Audio(type="filepath", label="Upload Your Audio"),
    outputs=gr.Audio(type="filepath", label="AI Response"),
    title="Multilingual Voice Interaction",
    description="Upload an audio file in any supported language. The system will respond with an audio reply in the same language.",
    # To add examples, upload example audio files to your Space and pass
    # examples=[["example1.wav"], ...] here.
    allow_flagging="never",  # Disable flagging to prevent misuse
)

if __name__ == "__main__":
    iface.launch()