# Englishia — Gradio English-practice assistant (app.py)
# (Hugging Face Space by Abbas133; commit 354513a)
import gradio as gr
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, Wav2Vec2ForCTC, Wav2Vec2Processor
import torch
import torchaudio
from gtts import gTTS
import os
# Model identifiers, named once so a checkpoint swap touches a single line.
ASR_MODEL_NAME = "facebook/wav2vec2-large-960h"
CHAT_MODEL_NAME = "microsoft/DialoGPT-medium"

# Load models (fetched from the Hugging Face Hub on first run).
asr_model = Wav2Vec2ForCTC.from_pretrained(ASR_MODEL_NAME)
asr_processor = Wav2Vec2Processor.from_pretrained(ASR_MODEL_NAME)
chat_model = AutoModelForCausalLM.from_pretrained(CHAT_MODEL_NAME)
chat_tokenizer = AutoTokenizer.from_pretrained(CHAT_MODEL_NAME)

# This app only runs inference: put both models in eval mode so dropout
# and other train-time layers are disabled.
asr_model.eval()
chat_model.eval()
# Function: Speech-to-Text (STT)
def speech_to_text(audio):
    """Transcribe an audio file to English text with Wav2Vec2.

    Args:
        audio: Path to an audio file (as supplied by gr.Audio(type="filepath")),
            or a falsy value when no recording was made.

    Returns:
        The transcription string, or a short message when no audio was given.
    """
    if not audio:
        return "No audio provided."
    # Load the audio file; waveform has shape (channels, num_samples).
    waveform, original_sample_rate = torchaudio.load(audio)
    # Wav2Vec2 expects 16 kHz input; resample if the recording differs.
    target_sample_rate = 16000
    if original_sample_rate != target_sample_rate:
        resampler = torchaudio.transforms.Resample(
            orig_freq=original_sample_rate, new_freq=target_sample_rate
        )
        waveform = resampler(waveform)
    # Down-mix to mono: the processor expects a single 1-D waveform, not a
    # (channels, time) tensor, so stereo input would otherwise be mis-shaped.
    waveform = waveform.mean(dim=0)
    # Feature-extract, then greedy CTC decode (argmax over the logits).
    inputs = asr_processor(
        waveform, sampling_rate=target_sample_rate, return_tensors="pt", padding=True
    )
    with torch.no_grad():  # inference only; skip autograd bookkeeping
        logits = asr_model(inputs.input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    return asr_processor.decode(predicted_ids[0])
# Function: Generate chatbot response
def chatbot_response(user_input, history):
    """Generate a DialoGPT reply to *user_input*, conditioned on *history*.

    Args:
        user_input: The user's latest utterance.
        history: Prior conversation text, prepended verbatim to the prompt.

    Returns:
        The model's reply with special tokens stripped.
    """
    # Prompt = running history + new utterance, terminated by EOS as
    # DialoGPT expects between turns.
    prompt = history + user_input + chat_tokenizer.eos_token
    inputs = chat_tokenizer.encode(prompt, return_tensors="pt")
    with torch.no_grad():  # inference only; no gradient graph needed
        response_ids = chat_model.generate(
            inputs,
            max_length=500,
            pad_token_id=chat_tokenizer.eos_token_id,
        )
    # Keep only the newly generated tokens (everything after the prompt).
    return chat_tokenizer.decode(
        response_ids[:, inputs.shape[-1]:][0], skip_special_tokens=True
    )
# Function: Text-to-Speech (TTS)
def text_to_speech(text, filename="response.mp3"):
    """Synthesize English speech for *text* and save it as an MP3.

    Args:
        text: The text to speak.
        filename: Output path for the generated MP3.

    Returns:
        The path of the saved audio file.
    """
    gTTS(text=text, lang="en").save(filename)
    return filename
# Main Chat Function
def englishia(audio, history=""):
    """Run one conversation turn: audio in -> spoken reply + updated history.

    Args:
        audio: Path to the user's recording, or falsy when nothing was said.
        history: Running transcript of the conversation so far.

    Returns:
        A tuple of (path to the spoken-response MP3, updated history string).
    """
    # Guard: nothing recorded — ask the user to retry, history untouched.
    if not audio:
        retry_prompt = "I didn't catch that. Please try speaking again."
        return text_to_speech(retry_prompt), history

    # Pipeline: speech -> text -> chatbot reply -> speech.
    user_text = speech_to_text(audio)
    bot_response = chatbot_response(user_text, history)
    spoken_reply = text_to_speech(bot_response)

    # Append this exchange to the transcript.
    history += f"User: {user_text}\nEnglishia: {bot_response}\n"
    return spoken_reply, history
# Gradio Interface
with gr.Blocks() as englishia_interface:
    gr.Markdown("# Welcome to Englishia: Your English Practice Assistant")
    # Input mic and output player side by side; transcript below.
    with gr.Row():
        user_audio = gr.Audio(type="filepath", label="Speak to Englishia")
        chatbot_output = gr.Audio(label="Englishia's Response")
    conversation_history = gr.Textbox(
        label="Conversation History", lines=10, interactive=False
    )
    submit_button = gr.Button("Submit")
    # Route (audio, history) through englishia; show the reply and transcript.
    submit_button.click(
        englishia,
        inputs=[user_audio, conversation_history],
        outputs=[chatbot_output, conversation_history],
    )

# Launch the App
englishia_interface.launch()