import gradio as gr
import torch
import torchaudio
from transformers import (
    SpeechT5ForTextToSpeech,
    SpeechT5HifiGan,
    SpeechT5Processor,
    Wav2Vec2ForCTC,
    Wav2Vec2Processor,
)

# --- Model loading (runs once at import time; downloads weights on first use) ---

# Speech-to-Text: wav2vec2 CTC model, expects 16 kHz mono audio.
stt_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
stt_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

# Text-to-Speech: SpeechT5 emits a mel spectrogram; the HiFi-GAN vocoder turns
# it into an audible 16 kHz waveform.  Without the vocoder, generate_speech()
# returns a spectrogram that gr.Audio cannot play.
tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
tts_vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# SpeechT5 requires a 512-dim speaker x-vector.  A zero vector gives a neutral
# voice; substitute an embedding from the CMU-Arctic x-vectors dataset for a
# more natural one.
speaker_embeddings = torch.zeros((1, 512))


def speech_to_text(audio_path):
    """Transcribe an audio file to text with wav2vec2.

    Args:
        audio_path: Path to the recorded audio file.

    Returns:
        The transcription string (upper-case, per the wav2vec2-base-960h vocab).
    """
    waveform, sample_rate = torchaudio.load(audio_path)
    # Downmix multi-channel recordings to mono; squeeze() alone would leave a
    # stereo clip as a 2-D tensor and break the processor call below.
    if waveform.dim() > 1 and waveform.size(0) > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    # wav2vec2-base-960h was trained on 16 kHz audio.
    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(
            orig_freq=sample_rate, new_freq=16000
        )
        waveform = resampler(waveform)
    input_values = stt_processor(
        waveform.squeeze(), return_tensors="pt", sampling_rate=16000
    ).input_values
    with torch.no_grad():  # inference only; skip autograd bookkeeping
        logits = stt_model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    return stt_processor.decode(predicted_ids[0])


def text_to_speech(text):
    """Synthesize speech for *text* with SpeechT5 + HiFi-GAN.

    Args:
        text: The sentence to speak.

    Returns:
        A (sample_rate, numpy_waveform) tuple in the format gr.Audio expects.
    """
    inputs = tts_processor(text=text, return_tensors="pt")
    with torch.no_grad():
        speech = tts_model.generate_speech(
            inputs["input_ids"],
            speaker_embeddings,       # required 512-dim x-vector
            vocoder=tts_vocoder,      # waveform output instead of a spectrogram
        )
    return (16000, speech.numpy())


def learn_english(audio, progress=gr.Progress()):
    """Full pipeline: transcribe the user's audio and speak feedback back.

    Args:
        audio: File path from the gr.Audio input (None if nothing was recorded).
        progress: Gradio progress tracker (injected by Gradio at call time).

    Returns:
        (feedback_text, audio_feedback, status_html) matching the three outputs
        bound in the UI.
    """
    # Guard: the user may press Submit without recording anything.
    if audio is None:
        return "", None, "⚠️ Please record some audio first."
    # Show loading indicator
    progress(0, desc="Processing your audio...")
    # Convert speech to text
    progress(0.5, desc="Transcribing your speech...")
    user_input = speech_to_text(audio)
    # Generate feedback
    progress(0.75, desc="Generating feedback...")
    feedback = f"You said: '{user_input}'. Great job! Let's practice more."
    audio_feedback = text_to_speech(feedback)
    # Return results with a success message
    return feedback, audio_feedback, "✅ Feedback generated successfully!"
# Custom CSS for styling the Gradio UI (fonts, buttons, boxes, animations).
custom_css = """
/* Import Google Fonts */
@import url('https://fonts.googleapis.com/css2?family=Roboto:wght@400;700&display=swap');
@import url('https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0-beta3/css/all.min.css');

/* General Styling */
body {
    font-family: 'Roboto', sans-serif;
    background-color: #f4f4f9;
    margin: 0;
    padding: 0;
}

/* Header Styling */
h1 {
    color: #333;
    text-align: center;
    margin-bottom: 20px;
    font-weight: 700;
    animation: fadeIn 1s ease-in-out;
}

/* Button Styling */
.gr-button {
    background-color: #ff6f61; /* Vibrant red-orange */
    color: white;
    border: none;
    padding: 12px 24px;
    border-radius: 25px; /* Rounded corners */
    font-size: 16px;
    cursor: pointer;
    transition: all 0.3s ease;
    box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
    position: relative;
    overflow: hidden;
}
.gr-button:hover {
    background-color: #ff3b2f; /* Darker red on hover */
    transform: translateY(-3px); /* Slight upward movement */
    box-shadow: 0 6px 10px rgba(0, 0, 0, 0.2);
}
.gr-button:active {
    transform: translateY(0);
    box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
}

/* Bouncing Animation */
@keyframes bounce {
    0%, 100% { transform: translateY(0); }
    50% { transform: translateY(-10px); }
}
.gr-button.bounce {
    animation: bounce 0.5s ease infinite;
}

/* Input and Output Boxes */
.gr-box {
    border-radius: 15px;
    border: 2px solid #ddd;
    padding: 20px;
    box-shadow: 0 8px 16px rgba(0, 0, 0, 0.1);
    background-color: #fff;
    animation: slideIn 0.8s ease-in-out;
}

/* Audio Feedback */
.gr-audio {
    margin-top: 15px;
}

/* Markdown Styling */
.markdown-body {
    font-size: 18px;
    line-height: 1.6;
    color: #555;
    animation: fadeIn 1s ease-in-out;
}

/* Icons */
.icon {
    font-size: 24px;
    margin-right: 10px;
    color: #ff6f61;
}

/* Success Message */
.success-message {
    color: #28a745;
    font-weight: bold;
    text-align: center;
    margin-top: 15px;
    animation: fadeIn 1s ease-in-out;
}

/* Animations */
@keyframes fadeIn {
    from { opacity: 0; }
    to { opacity: 1; }
}
@keyframes slideIn {
    from { transform: translateY(20px); opacity: 0; }
    to { transform: translateY(0); opacity: 1; }
}

/* Responsive Design */
@media (max-width: 768px) {
    .gr-row { flex-direction: column; }
    .gr-column { width: 100%; margin-bottom: 20px; }
    h1 { font-size: 24px; }
    .gr-button { width: 100%; padding: 12px; }
}
"""

# Create Gradio Interface with Custom CSS
with gr.Blocks(theme=gr.themes.Soft(), css=custom_css) as demo:
    gr.Markdown("""
    # 🌟 English Language Learning App 🌟
    Welcome to the English Language Learning App!
    - Speak into the microphone, and the app will transcribe your speech.
    - It will then provide feedback in both text and audio formats.
    """)

    with gr.Row():
        with gr.Column(scale=1):
            # Left column for input
            gr.Markdown(" ### Step 1: Record Your Voice")
            audio_input = gr.Audio(
                type="filepath",
                label="🎤 Speak into the Microphone",
                interactive=True,
            )
            submit_button = gr.Button(
                " Submit", variant="primary", elem_classes=["bounce"]
            )
        with gr.Column(scale=2):
            # Right column for output
            gr.Markdown(" ### Step 2: View Feedback")
            text_output = gr.Textbox(
                label="📝 Transcription",
                placeholder="Your transcription will appear here...",
                lines=3,
            )
            audio_output = gr.Audio(
                label="🎧 Audio Feedback",
                autoplay=True,
            )
            success_message = gr.HTML("", elem_classes=["success-message"])

    # Bind the button to the processing pipeline; outputs map 1:1 to the
    # (feedback, audio, status) tuple returned by learn_english.
    submit_button.click(
        learn_english,
        inputs=audio_input,
        outputs=[text_output, audio_output, success_message],
    )

# Launch only when run as a script, not when imported as a module.
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)