# NOTE(review): this file was recovered from a Hugging Face Space page that
# showed a "Build error" banner — the legacy gr.Audio(source=..., type="file")
# arguments used below were removed in Gradio 4 and are the likely cause.
import os
import tempfile

import gradio as gr
import torch
import whisper
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import MarianMTModel, MarianTokenizer
from TTS.api import TTS
# -----------------------------
# Model Loading Section
# -----------------------------
# Load Whisper model (speech-to-text plus spoken-language detection)
print("Loading Whisper model...")
stt_model = whisper.load_model("tiny")  # Use "tiny" for faster performance and lower resource usage
print("Whisper model loaded.")
# Per-process cache of MarianMT (tokenizer, model) pairs keyed "src-tgt".
# Filled lazily by get_translation_model() so only the language pairs
# actually requested are downloaded and held in memory.
translation_models = {}
def get_translation_model(src_lang, tgt_lang):
    """Return the (tokenizer, model) pair for a MarianMT translation direction.

    Pairs are loaded on first use and memoized in the module-level
    ``translation_models`` cache. Returns ``(None, None)`` when no
    Helsinki-NLP model exists for the requested direction.
    """
    cache_key = f"{src_lang}-{tgt_lang}"
    cached = translation_models.get(cache_key)
    if cached is not None:
        return cached
    model_name = f'Helsinki-NLP/opus-mt-{src_lang}-{tgt_lang}'
    try:
        pair = (
            MarianTokenizer.from_pretrained(model_name),
            MarianMTModel.from_pretrained(model_name),
        )
    except Exception as e:
        # Most Helsinki-NLP pairs exist, but not all; callers treat
        # (None, None) as "translation unavailable".
        print(f"Translation model {model_name} not found. Error: {e}")
        return None, None
    translation_models[cache_key] = pair
    print(f"Loaded translation model: {model_name}")
    return pair
# Load Language Model (GPT-Neo) used to generate the conversational reply
print("Loading Language Model...")
lm_model_name = "EleutherAI/gpt-neo-125M"  # Smaller model suitable for free tier
lm_tokenizer = AutoTokenizer.from_pretrained(lm_model_name)
lm_model = AutoModelForCausalLM.from_pretrained(lm_model_name)
print("Language model loaded.")
# Load TTS model (single English LJSpeech voice, CPU inference — gpu=False)
print("Loading TTS model...")
tts_model = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False, gpu=False)
print("TTS model loaded.")
| # ----------------------------- | |
| # Function Definitions | |
| # ----------------------------- | |
def speech_to_text(audio_path):
    """Transcribe the audio file at ``audio_path`` with Whisper.

    Returns a ``(text, language_code)`` tuple, where the language code is
    Whisper's detected spoken language (e.g. ``"en"``, ``"fr"``).
    """
    transcription = stt_model.transcribe(audio_path)
    text, detected_lang = transcription["text"], transcription["language"]
    print(f"Transcribed Text: {text}")
    print(f"Detected Language: {detected_lang}")
    return text, detected_lang
def translate_text(text, src_lang, tgt_lang='en'):
    """Translate ``text`` from ``src_lang`` to ``tgt_lang`` with MarianMT.

    Best-effort: when the source and target languages match, or when no
    translation model exists for the pair, the input text is returned
    unchanged rather than raising.
    """
    if src_lang == tgt_lang:
        print("No translation needed.")
        return text

    tokenizer, model = get_translation_model(src_lang, tgt_lang)
    if tokenizer is None or model is None:
        print(f"No translation model found for {src_lang} to {tgt_lang}. Returning original text.")
        return text

    batch = tokenizer(text, return_tensors="pt", padding=True)
    output_ids = model.generate(**batch)
    translated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    print(f"Translated Text ({src_lang} -> {tgt_lang}): {translated_text}")
    return translated_text
def generate_response(prompt):
    """Generate a conversational reply to ``prompt`` with the causal LM.

    Returns only the newly generated continuation text (the prompt itself is
    stripped off), whitespace-trimmed.
    """
    inputs = lm_tokenizer(prompt, return_tensors="pt")
    # max_new_tokens instead of max_length: max_length=150 counted the prompt
    # tokens too, so long prompts left little or no room for a reply (and a
    # prompt over 150 tokens would make generate() fail outright).
    # attention_mask and pad_token_id are passed explicitly to avoid
    # generation warnings and padding ambiguity (GPT-Neo has no pad token).
    outputs = lm_model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=100,
        do_sample=True,
        temperature=0.7,
        pad_token_id=lm_tokenizer.eos_token_id,
    )
    # Slice off the prompt at the token level: decoding the full sequence and
    # slicing by len(prompt) breaks when the tokenizer does not round-trip
    # the prompt text exactly (whitespace/unicode normalization).
    new_token_ids = outputs[0][inputs.input_ids.shape[1]:]
    response = lm_tokenizer.decode(new_token_ids, skip_special_tokens=True).strip()
    print(f"AI Response: {response}")
    return response
def text_to_speech(text, lang='en'):
    """Synthesize ``text`` to a WAV file with Coqui TTS; return the file path.

    Only an English voice is loaded; any other ``lang`` falls back to English
    after printing a notice. The caller is responsible for deleting the
    returned file.
    """
    if lang != 'en':
        # Extend with multilingual TTS models as needed
        print(f"TTS for language '{lang}' not implemented. Using English TTS.")
    # Create the temp file, then CLOSE it before synthesis: writing to
    # tmp.name while the NamedTemporaryFile handle is still open fails on
    # Windows, where an open temp file cannot be reopened by another writer.
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    tmp.close()
    tts_model.tts_to_file(text=text, file_path=tmp.name)
    print(f"Generated TTS audio at: {tmp.name}")
    return tmp.name
def process_audio(audio):
    """Full pipeline: Speech-to-Text -> Translate -> Generate Response ->
    Translate Back -> Text-to-Speech.

    ``audio`` may be a filesystem path (what Gradio's ``type="filepath"``
    passes) or a readable file-like object; the original code assumed a
    file-like with ``.size``/``.read()``, which Gradio never provides.
    Returns the path of the synthesized reply WAV, or None on failure, so the
    ``gr.Audio`` output component can serve the file (the previous version
    returned raw bytes and then deleted the file in ``finally`` before Gradio
    could read it).
    """
    if audio is None:
        print("No audio received.")
        return None

    MAX_BYTES = 10 * 1024 * 1024  # reject uploads over 10MB to keep processing cheap
    cleanup_input = False
    if isinstance(audio, (str, os.PathLike)):
        tmp_path = os.fspath(audio)
        if os.path.getsize(tmp_path) > MAX_BYTES:
            print("Uploaded audio file is too large.")
            return None
    else:
        # File-like upload: persist to a temp file so Whisper can read a path.
        data = audio.read()
        if len(data) > MAX_BYTES:
            print("Uploaded audio file is too large.")
            return None
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
            tmp.write(data)
            tmp_path = tmp.name
        cleanup_input = True
        print(f"Audio saved to temporary file: {tmp_path}")

    try:
        # Step 1: Speech-to-Text (also detects the spoken language)
        user_text, detected_lang = speech_to_text(tmp_path)
        # Step 2: Translate to English for the English-only language model
        translated_text = translate_text(user_text, src_lang=detected_lang, tgt_lang='en')
        # Step 3: Generate Response
        ai_response = generate_response(translated_text)
        # Step 4: Translate Back to User's Language
        translated_response = translate_text(ai_response, src_lang='en', tgt_lang=detected_lang)
        # Step 5: Text-to-Speech; Gradio serves (and owns) the returned file.
        return text_to_speech(translated_response, lang=detected_lang)
    except Exception as e:
        print(f"Error during processing: {e}")
        # Optionally, return an error message or a default audio response
        return None
    finally:
        # Remove only the temp copy WE created; never delete a caller-owned path.
        if cleanup_input and os.path.exists(tmp_path):
            os.remove(tmp_path)
            print("Temporary files cleaned up.")
# -----------------------------
# Gradio Interface Definition
# -----------------------------
iface = gr.Interface(
    fn=process_audio,
    # Gradio 4 API: the old `source="upload"` / `type="file"` arguments were
    # removed and break the build. `type="filepath"` hands the handler a path
    # string and lets the output component serve the file returned by it.
    inputs=gr.Audio(sources=["upload"], type="filepath", label="Upload Your Audio"),
    outputs=gr.Audio(type="filepath", label="AI Response"),
    title="Multilingual Voice Interaction",
    description="Upload an audio file in any supported language. The system will respond with an audio reply in the same language.",
    examples=[
        # To add examples, upload example audio files to your Space and reference their paths here
        # ["example1.wav"],
        # ["example2.wav"],
    ],
    allow_flagging="never",  # Disable flagging to prevent misuse
)

if __name__ == "__main__":
    iface.launch()