"""Grammar & Speaking Tutor.

A Gradio app that transcribes the user's speech (Whisper), asks a small
instruction-tuned LLM to correct grammar and reply (Qwen2.5 via transformers),
and speaks the answer back (gTTS).
"""

import os
import tempfile
import time

import gradio as gr
import torch
import whisper
from gtts import gTTS
from transformers import pipeline

# --- 1. CONFIGURATION ---
# Using the Ultra-Fast 0.5B model (switch to "1.5B" if you want it smarter)
MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"

print(f"⏳ Loading {MODEL_ID}...")
pipe = pipeline(
    "text-generation",
    model=MODEL_ID,
    model_kwargs={"low_cpu_mem_usage": True},
    device_map="cpu",
)
whisper_model = whisper.load_model("tiny.en")
print("✅ Systems Ready.")

# --- 2. TEACHER PERSONA (UPDATED FOR GRAMMAR) ---
SYSTEM_PROMPT = """
You are a helpful English teacher.
1. If the user makes a grammar mistake, ALWAYS correct it first. Start with "Correction: [Correct Sentence]".
2. Then, answer the user's question or continue the chat.
3. Keep your own English simple (A2 level).
4. Keep responses short (max 2 sentences).
"""


# --- 3. PROCESSING FUNCTIONS ---
def text_to_speech(text):
    """Render *text* to an MP3 file and return its path, or None on failure.

    Best-effort: any synthesis/network error (gTTS needs internet) returns
    None instead of raising, so the chat still works without audio.
    """
    try:
        if not text:
            return None
        # Strip markdown markers so the voice doesn't read them aloud.
        text_clean = text.replace("*", "").replace("#", "")
        tts = gTTS(text_clean, lang='en')
        # Unique temp file per response: avoids same-second name collisions
        # and keeps MP3s out of the working directory.
        # NOTE(review): files are never deleted — consider periodic cleanup.
        fd, filename = tempfile.mkstemp(suffix=".mp3", prefix="response_")
        os.close(fd)
        tts.save(filename)
        return filename
    except Exception:
        return None


def generate_response(message, history):
    """Ask the LLM for a reply to *message*, given recent chat *history*.

    *history* is a list of (user_msg, bot_msg) tuples; only the last two
    turns are sent to keep generation fast on CPU. Returns the assistant's
    text.
    """
    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
    # Keep history short for speed
    for user_msg, bot_msg in history[-2:]:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": bot_msg})
    messages.append({"role": "user", "content": message})

    outputs = pipe(
        messages,
        max_new_tokens=60,   # Slightly increased to allow room for corrections
        do_sample=True,
        temperature=0.6,     # Lower temperature = more accurate grammar
    )
    return outputs[0]["generated_text"][-1]["content"]


def conversation_logic(audio_path, text_input, history):
    """Handle one chat turn: transcribe (or take typed text), reply, speak.

    Returns (updated history, path to reply audio or None, "" to clear the
    textbox). Empty/whitespace-only input leaves everything unchanged.
    """
    user_text = ""
    # 1. Transcribe
    if audio_path:
        result = whisper_model.transcribe(audio_path)
        user_text = result["text"]
    elif text_input:
        user_text = text_input
    else:
        return history, None, ""

    if not user_text.strip():
        return history, None, ""

    # 2. Think
    ai_response = generate_response(user_text, history)
    # 3. Speak
    ai_audio = text_to_speech(ai_response)
    # 4. Update UI
    history.append((user_text, ai_response))
    return history, ai_audio, ""


# --- 4. UI SETUP ---
with gr.Blocks(title="Grammar Tutor") as demo:
    gr.Markdown("# ⚡ Grammar & Speaking Tutor")
    gr.Markdown("I will correct your mistakes and chat with you.")

    chatbot = gr.Chatbot(label="Conversation")
    with gr.Row():
        audio_in = gr.Audio(sources=["microphone"], type="filepath", label="🎤 Speak")
        text_in = gr.Textbox(label="⌨️ Type")
    submit_btn = gr.Button("Send", variant="primary")
    audio_out = gr.Audio(label="Teacher's Voice", autoplay=True)

    submit_btn.click(
        fn=conversation_logic,
        inputs=[audio_in, text_in, chatbot],
        outputs=[chatbot, audio_out, text_in],
    )

if __name__ == "__main__":
    demo.launch()