| | import gradio as gr |
| | from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, Wav2Vec2ForCTC, Wav2Vec2Processor |
| | import torch |
| | import torchaudio |
| | from gtts import gTTS |
| | import os |
| |
|
| | |
# Speech-to-text: Wav2Vec2 CTC model fine-tuned on 960h LibriSpeech (expects 16 kHz mono audio).
asr_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")
asr_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")
# Reply generation: DialoGPT conversational causal LM and its matching tokenizer.
chat_model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-medium")
chat_tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
| |
|
| | |
def speech_to_text(audio):
    """Transcribe a recorded audio file to text with Wav2Vec2.

    Args:
        audio: Path to an audio file (Gradio ``type="filepath"``), or a
            falsy value when nothing was recorded.

    Returns:
        str: The transcription, or a fallback message when no audio is given.
    """
    if not audio:
        return "No audio provided."

    waveform, original_sample_rate = torchaudio.load(audio)

    # Wav2Vec2 was trained on 16 kHz audio; resample anything else.
    target_sample_rate = 16000
    if original_sample_rate != target_sample_rate:
        resampler = torchaudio.transforms.Resample(
            orig_freq=original_sample_rate, new_freq=target_sample_rate
        )
        waveform = resampler(waveform)

    # torchaudio.load returns (channels, frames); the processor expects a
    # single 1-D mono signal, so average the channels down to one. The
    # original code fed the 2-D tensor straight in, which mis-batches
    # stereo recordings.
    if waveform.dim() > 1:
        waveform = waveform.mean(dim=0)

    features = asr_processor(
        waveform, sampling_rate=target_sample_rate, return_tensors="pt", padding=True
    )
    # Inference only — no gradient bookkeeping needed.
    with torch.no_grad():
        logits = asr_model(features.input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    return asr_processor.decode(predicted_ids[0])
| |
|
| |
|
| | |
def chatbot_response(user_input, history):
    """Generate a DialoGPT reply to *user_input* given the running transcript.

    Args:
        user_input: The user's latest utterance (plain text).
        history: Accumulated conversation transcript prepended as context.

    Returns:
        str: The model's reply, with special tokens stripped.
    """
    prompt = history + user_input + chat_tokenizer.eos_token
    inputs = chat_tokenizer.encode(prompt, return_tensors="pt")
    # Pass an explicit attention mask: pad_token == eos_token here, so
    # without it generate() warns and may attend incorrectly.
    attention_mask = torch.ones_like(inputs)
    with torch.no_grad():
        # max_new_tokens (not max_length) so the reply budget does not
        # shrink to zero as the history grows past the old 500-token cap.
        response_ids = chat_model.generate(
            inputs,
            attention_mask=attention_mask,
            max_new_tokens=200,
            pad_token_id=chat_tokenizer.eos_token_id,
        )
    # Decode only the newly generated tokens, skipping the echoed prompt.
    return chat_tokenizer.decode(
        response_ids[:, inputs.shape[-1]:][0], skip_special_tokens=True
    )
| |
|
| | |
def text_to_speech(text, filename="response.mp3"):
    """Synthesize English speech for *text* via gTTS and save it as an MP3.

    Args:
        text: The sentence to speak.
        filename: Output path for the MP3 (default ``"response.mp3"``).

    Returns:
        str: The path of the saved audio file.
    """
    speech = gTTS(text=text, lang="en")
    speech.save(filename)
    return filename
| |
|
| | |
def englishia(audio, history=""):
    """Run one full voice round-trip: transcribe, reply, and speak the reply.

    Args:
        audio: Path to the user's recording, or a falsy value if none.
        history: Running transcript of the conversation so far.

    Returns:
        tuple[str, str]: (path to the spoken-response MP3, updated history).
    """
    # Guard clause: nothing recorded — speak a retry prompt, history untouched.
    if not audio:
        fallback = "I didn't catch that. Please try speaking again."
        return text_to_speech(fallback), history

    # Transcribe the user's speech, generate a reply, then voice it.
    user_text = speech_to_text(audio)
    bot_response = chatbot_response(user_text, history)
    audio_response = text_to_speech(bot_response)

    # Append this exchange to the transcript shown in the UI.
    history = history + f"User: {user_text}\nEnglishia: {bot_response}\n"
    return audio_response, history
| |
|
| |
|
| | |
# Gradio UI: microphone input on the left, spoken reply on the right,
# with a read-only running transcript underneath.
with gr.Blocks() as englishia_interface:
    gr.Markdown("# Welcome to Englishia: Your English Practice Assistant")

    with gr.Row():
        # type="filepath" hands speech_to_text a path it can load with torchaudio.
        user_audio = gr.Audio(type="filepath", label="Speak to Englishia")
        chatbot_output = gr.Audio(label="Englishia's Response")

    # Doubles as UI display and as the state threaded through englishia().
    conversation_history = gr.Textbox(label="Conversation History", lines=10, interactive=False)

    submit_button = gr.Button("Submit")
    submit_button.click(
        englishia,
        inputs=[user_audio, conversation_history],
        outputs=[chatbot_output, conversation_history]
    )

# Start the local Gradio server (blocks until shut down).
englishia_interface.launch()
| |
|