File size: 3,342 Bytes
0d79937
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import os
import tempfile
import time

import gradio as gr
import torch
import whisper
from gtts import gTTS
from transformers import pipeline

# --- 1. CONFIGURATION ---
# Using the Ultra-Fast 0.5B model (Switch to "1.5B" if you want it smarter)
MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"

print(f"⏳ Loading {MODEL_ID}...")

# Chat LLM used for grammar correction. Pinned to CPU; low_cpu_mem_usage
# reduces peak RAM while the weights are loaded.
pipe = pipeline(
    "text-generation",
    model=MODEL_ID,
    model_kwargs={"low_cpu_mem_usage": True},
    device_map="cpu", 
)

# Speech-to-text: the smallest English-only Whisper checkpoint, chosen for speed.
whisper_model = whisper.load_model("tiny.en")

print("✅ Systems Ready.")

# --- 2. TEACHER PERSONA (UPDATED FOR GRAMMAR) ---
# Injected as the "system" turn of every request in generate_response().
# The "Correction: ..." prefix it mandates is part of the UX contract: the
# learner always sees the corrected sentence before the conversational reply.
SYSTEM_PROMPT = """
You are a helpful English teacher.
1. If the user makes a grammar mistake, ALWAYS correct it first. Start with "Correction: [Correct Sentence]".
2. Then, answer the user's question or continue the chat.
3. Keep your own English simple (A2 level).
4. Keep responses short (max 2 sentences).
"""

# --- 3. PROCESSING FUNCTIONS ---

def text_to_speech(text):
    """Synthesize *text* to an MP3 via gTTS and return the file path.

    Returns None when *text* is empty or synthesis fails (gTTS needs
    network access), letting the caller skip audio playback gracefully.
    """
    if not text:
        return None

    # Strip markdown markers the model sometimes emits — they sound bad spoken.
    text_clean = text.replace("*", "").replace("#", "")

    try:
        tts = gTTS(text_clean, lang='en')
        # A unique temp file avoids collisions when two requests arrive within
        # the same second (the previous int(time.time()) name could clash) and
        # keeps generated audio out of the working directory.
        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as fp:
            filename = fp.name
        tts.save(filename)
        return filename
    except Exception:
        # Narrowed from a bare `except:` so Ctrl-C / SystemExit still propagate.
        # Best-effort by design: the chat continues text-only on TTS failure.
        return None

def generate_response(message, history):
    """Ask the LLM for a teacher reply to *message*.

    Replays only the last two (user, bot) exchanges from *history* to keep
    the prompt short and inference fast on CPU.
    """
    chat = [{"role": "system", "content": SYSTEM_PROMPT}]

    for prior_user, prior_bot in history[-2:]:
        chat.extend([
            {"role": "user", "content": prior_user},
            {"role": "assistant", "content": prior_bot},
        ])

    chat.append({"role": "user", "content": message})

    result = pipe(
        chat,
        max_new_tokens=60,   # enough room for "Correction: ..." plus a short reply
        do_sample=True,
        temperature=0.6,     # lower temperature keeps the grammar corrections accurate
    )

    # The pipeline echoes the full conversation; the final turn is the new reply.
    return result[0]["generated_text"][-1]["content"]

def conversation_logic(audio_path, text_input, history):
    """Run one full chat turn: transcribe → generate reply → synthesize voice.

    Returns (updated history, reply audio path or None, "" to clear the textbox).
    Audio input takes priority over typed text when both are present.
    """
    # 1. Get the user's words (microphone first, then the textbox).
    if audio_path:
        user_text = whisper_model.transcribe(audio_path)["text"]
    elif text_input:
        user_text = text_input
    else:
        return history, None, ""

    # Whisper can return pure whitespace for silence — treat it as no input.
    if not user_text.strip():
        return history, None, ""

    # 2. Think.
    reply = generate_response(user_text, history)

    # 3. Speak (may be None if TTS fails; the UI simply shows text only).
    reply_audio = text_to_speech(reply)

    # 4. Update the chat log shown in the UI.
    history.append((user_text, reply))
    return history, reply_audio, ""

# --- 4. UI SETUP ---

with gr.Blocks(title="Grammar Tutor") as demo:
    # Plain string: the original f-string had no placeholders (needless f-prefix).
    gr.Markdown("# ⚡ Grammar & Speaking Tutor")
    gr.Markdown("I will correct your mistakes and chat with you.")

    chatbot = gr.Chatbot(label="Conversation")

    # Input row: speak into the microphone or type a message.
    with gr.Row():
        audio_in = gr.Audio(sources=["microphone"], type="filepath", label="🎤 Speak")
        text_in = gr.Textbox(label="⌨️ Type")

    submit_btn = gr.Button("Send", variant="primary")
    # autoplay so the teacher's reply is spoken as soon as it is generated
    audio_out = gr.Audio(label="Teacher's Voice", autoplay=True)

    # One handler runs the whole turn; the third output ("") clears the textbox.
    submit_btn.click(
        fn=conversation_logic,
        inputs=[audio_in, text_in, chatbot],
        outputs=[chatbot, audio_out, text_in]
    )

demo.launch()