"""Englishia: a Gradio voice assistant for practicing English.

Pipeline per turn: microphone audio -> wav2vec2 ASR -> DialoGPT reply ->
gTTS audio response, with a running plain-text conversation history.
"""

import os

import torch
import torchaudio
import gradio as gr
from gtts import gTTS
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Wav2Vec2ForCTC,
    Wav2Vec2Processor,
)

# --- Load models (weights are downloaded on first run) ---
asr_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")
asr_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")
chat_model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-medium")
chat_tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")

# FIX: inference-only app — switch off dropout/batch-norm training behavior.
asr_model.eval()
chat_model.eval()


def speech_to_text(audio):
    """Transcribe an audio file to text with wav2vec2.

    Args:
        audio: path to an audio file (Gradio supplies a filepath), or a
            falsy value when no recording was made.

    Returns:
        The transcription string (wav2vec2-960h emits upper-case text
        without punctuation), or a fixed message when no audio was given.
    """
    if not audio:
        return "No audio provided."

    waveform, original_sample_rate = torchaudio.load(audio)

    # Resample to the 16 kHz rate the wav2vec2 checkpoint expects.
    target_sample_rate = 16000
    if original_sample_rate != target_sample_rate:
        resampler = torchaudio.transforms.Resample(
            orig_freq=original_sample_rate, new_freq=target_sample_rate
        )
        waveform = resampler(waveform)

    # FIX: torchaudio.load returns (channels, samples). The processor
    # expects a 1-D mono signal; downmix multi-channel input by averaging
    # instead of feeding it a malformed 2-D "batch".
    if waveform.dim() > 1:
        waveform = waveform.mean(dim=0)

    inputs = asr_processor(
        waveform.numpy(),
        sampling_rate=target_sample_rate,
        return_tensors="pt",
        padding=True,
    )
    # FIX: no autograd graph needed at inference time.
    with torch.no_grad():
        logits = asr_model(inputs.input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    return asr_processor.decode(predicted_ids[0])


def chatbot_response(user_input, history):
    """Generate a DialoGPT reply to *user_input* given the running history.

    Args:
        user_input: the user's latest utterance.
        history: prior conversation text, prepended verbatim to the prompt.

    Returns:
        The newly generated reply text (special tokens stripped).
    """
    inputs = chat_tokenizer.encode(
        history + user_input + chat_tokenizer.eos_token, return_tensors="pt"
    )
    # FIX: wrap generation in no_grad — inference only.
    with torch.no_grad():
        response_ids = chat_model.generate(
            inputs, max_length=500, pad_token_id=chat_tokenizer.eos_token_id
        )
    # Keep only the tokens generated after the prompt.
    return chat_tokenizer.decode(
        response_ids[:, inputs.shape[-1]:][0], skip_special_tokens=True
    )


def text_to_speech(text, filename="response.mp3"):
    """Synthesize *text* to an English MP3 file and return its path.

    NOTE(review): the default filename is shared, so concurrent requests
    would overwrite each other's output — acceptable for a single-user
    demo, worth a per-request temp file in production.
    """
    tts = gTTS(text=text, lang="en")
    tts.save(filename)
    return filename


def englishia(audio, history=""):
    """Run one conversational turn: audio in, (audio reply, new history) out.

    Args:
        audio: filepath of the user's recording, or falsy if none.
        history: accumulated "User: .../Englishia: ..." transcript.

    Returns:
        Tuple of (path to the MP3 reply, updated history string).
    """
    # Step 1: Check if audio is provided
    if not audio:
        response = "I didn't catch that. Please try speaking again."
        audio_response = text_to_speech(response)
        return audio_response, history

    # Step 2: Convert speech to text
    user_text = speech_to_text(audio)

    # Step 3: Generate chatbot response
    bot_response = chatbot_response(user_text, history)
    # FIX: gTTS raises on empty text; fall back to a spoken apology when
    # the model produces nothing.
    if not bot_response.strip():
        bot_response = "I didn't catch that. Please try speaking again."

    # Step 4: Convert chatbot response to speech
    audio_response = text_to_speech(bot_response)

    # Update conversation history
    history += f"User: {user_text}\nEnglishia: {bot_response}\n"
    return audio_response, history


# --- Gradio Interface ---
with gr.Blocks() as englishia_interface:
    gr.Markdown("# Welcome to Englishia: Your English Practice Assistant")
    with gr.Row():
        user_audio = gr.Audio(type="filepath", label="Speak to Englishia")
        chatbot_output = gr.Audio(label="Englishia's Response")
    conversation_history = gr.Textbox(
        label="Conversation History", lines=10, interactive=False
    )
    submit_button = gr.Button("Submit")
    submit_button.click(
        englishia,
        inputs=[user_audio, conversation_history],
        outputs=[chatbot_output, conversation_history],
    )

# Launch the App
englishia_interface.launch()