# Tutor / app.py
# Adedoyinjames's picture
# Update app.py
# 6511d12 verified
import gradio as gr
import torch
import numpy as np
from PIL import Image
import base64
import io
from transformers import (
CLIPProcessor, CLIPModel,
AutoTokenizer, AutoModelForCausalLM,
)
import pyttsx3
import json
from pathlib import Path
# ============================================
# CONFIGURATION
# ============================================
# Everything runs on CPU in full precision.
DEVICE = "cpu"
TORCH_DTYPE = torch.float32

# Model names (CPU-optimized)
CLIP_MODEL_NAME = "openai/clip-vit-base-patch32"
LLM_MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"

# ============================================
# INITIALIZE MODELS (Global, loaded once)
# ============================================
# Models are loaded at import time so every request reuses the same instances.
print("[INFO] Loading CLIP model...")
clip_model = CLIPModel.from_pretrained(
    CLIP_MODEL_NAME,
    torch_dtype=TORCH_DTYPE,
    device_map=DEVICE,
).to(DEVICE).eval()
clip_processor = CLIPProcessor.from_pretrained(CLIP_MODEL_NAME)

print("[INFO] Loading LLM (Qwen2.5-1.5B)...")
llm_tokenizer = AutoTokenizer.from_pretrained(
    LLM_MODEL_NAME,
    trust_remote_code=True,
)
llm_model = AutoModelForCausalLM.from_pretrained(
    LLM_MODEL_NAME,
    torch_dtype=TORCH_DTYPE,
    device_map=DEVICE,
    trust_remote_code=True,
    low_cpu_mem_usage=True,
).to(DEVICE).eval()

print("[INFO] Initializing TTS...")
# Speech is a best-effort extra: text_to_speech() already reports any engine
# failure as "no audio", so a missing speech backend on the host must not
# prevent the whole app from starting.
try:
    tts_engine = pyttsx3.init()
    tts_engine.setProperty('rate', 150)  # Speech rate
except Exception as e:
    print(f"[ERROR] TTS unavailable, audio output disabled: {e}")
    tts_engine = None
# ============================================
# HELPER FUNCTIONS
# ============================================
def analyze_screenshot_with_clip(image: Image.Image) -> dict:
    """Classify a screenshot against a fixed set of screen-type labels with CLIP.

    Args:
        image: The screenshot as a PIL image.

    Returns:
        A dict with two keys:
        ``"detected_context"`` - the best-matching label, and
        ``"confidence"`` - that label's softmax probability over the
        candidate labels, as a float.
    """
    # Candidate descriptions of what might be on screen.
    labels = [
        "Python code editor",
        "JavaScript code",
        "HTML/CSS markup",
        "Terminal/console output",
        "Error message",
        "Browser DevTools",
        "IDE or text editor",
        "File explorer",
        "Command line",
        "Documentation page",
    ]

    # Convert first so palette/alpha images are resampled as true colour,
    # then shrink the whole screen to CLIP's input size for faster processing.
    image = image.convert("RGB").resize((224, 224), Image.Resampling.LANCZOS)

    with torch.no_grad():
        inputs = clip_processor(
            text=labels,
            images=image,
            return_tensors="pt",
            padding=True,
        ).to(DEVICE)
        # Use the model's own image-text logits: they are cosine similarities
        # scaled by the learned temperature. A softmax over raw, un-normalised
        # embedding dot products does not yield meaningful probabilities.
        logits_per_image = clip_model(**inputs).logits_per_image
        probs = logits_per_image.softmax(dim=-1).cpu().numpy()[0]

    top_idx = int(np.argmax(probs))
    return {
        "detected_context": labels[top_idx],
        "confidence": float(probs[top_idx]),
    }
def generate_beginner_guidance(
    user_query: str,
    screen_context: str,
    history: list
) -> str:
    """Generate a beginner-friendly explanation with the LLM.

    Args:
        user_query: The learner's question.
        screen_context: Short description of what is on screen.
        history: Prior conversation as a list of ``{"role", "content"}``
            dicts; only the last four entries are used. Entries that are not
            dicts or have no content are ignored.

    Returns:
        The generated reply with surrounding whitespace stripped.
    """
    # Build history text from the last 4 messages, skipping malformed entries
    # (the history originates from JSON and is not guaranteed to be clean).
    history_lines = []
    for msg in (history or [])[-4:]:
        if not isinstance(msg, dict):
            continue
        content = msg.get("content")
        if not content:
            continue
        speaker = "User" if msg.get("role") == "user" else "Assistant"
        history_lines.append(f"{speaker}: {content}\n")
    history_text = "".join(history_lines)

    # System prompt
    system_prompt = (
        "You are an expert coding tutor teaching beginners. Your rules:\n"
        "1. Explain like they've never coded before - define every term\n"
        "2. Use analogies - relate coding concepts to real-world things\n"
        "3. Break it down - never give full solutions, only next small step\n"
        "4. Be encouraging - celebrate small wins\n"
        "5. Use simple language - avoid jargon without explanation\n"
        "6. Give code examples - show concrete examples when relevant\n"
        "Current screen context: {context}\n"
        "User's question: {query}\n"
        "Provide a step-by-step explanation (2-3 short paragraphs maximum). "
        "Be friendly and encouraging."
    )
    prompt = system_prompt.format(context=screen_context, query=user_query)
    if history_text:
        prompt += f"\n\nPrevious conversation:\n{history_text}"

    # Generate
    messages = [{"role": "user", "content": prompt}]
    with torch.no_grad():
        text = llm_tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )
        # A single sequence needs no padding.
        model_inputs = llm_tokenizer(text, return_tensors="pt").to(DEVICE)
        generated_ids = llm_model.generate(
            **model_inputs,
            max_new_tokens=200,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=llm_tokenizer.eos_token_id,
        )

    # The output starts with the prompt tokens; decode only what was generated.
    prompt_length = model_inputs.input_ids.shape[1]
    response = llm_tokenizer.decode(
        generated_ids[0][prompt_length:],
        skip_special_tokens=True,
    )
    return response.strip()
def text_to_speech(text: str, speed: float = 1.0) -> "str | None":
    """Convert text to speech using pyttsx3 and return the audio file path.

    Args:
        text: The text to speak.
        speed: Multiplier applied to the base rate of 150; the resulting
            rate is clamped to the range 50-300.

    Returns:
        Path to a newly created WAV file, or ``None`` when there is nothing
        to speak or the speech engine fails for any reason.
    """
    import os
    import tempfile

    # Nothing to synthesise.
    if not text or not text.strip():
        return None

    temp_file = None
    try:
        # Adjust speed
        rate = int(150 * speed)
        tts_engine.setProperty('rate', max(50, min(300, rate)))

        # Use a unique file per call: a single fixed path would be overwritten
        # by a concurrent request while an earlier result is still being
        # served, and "/tmp" does not exist on every platform.
        fd, temp_file = tempfile.mkstemp(prefix="speech_", suffix=".wav")
        os.close(fd)

        tts_engine.save_to_file(text, temp_file)
        tts_engine.runAndWait()
        return temp_file
    except Exception as e:
        print(f"[ERROR] TTS failed: {e}")
        # Do not leave an unused placeholder file behind.
        if temp_file is not None:
            try:
                os.remove(temp_file)
            except OSError:
                pass
        return None
# ============================================
# GRADIO INTERFACE
# ============================================
def coder_tutor(
    screenshot: "Image.Image | None",
    user_query: str,
    speech_speed: float,
    history_json: str
):
    """Main tutor function: analyse the screenshot, answer, and speak the answer.

    Args:
        screenshot: The uploaded screenshot, or ``None`` if nothing was uploaded.
        user_query: The learner's question; when blank, a default question
            ("What should I do next?") is asked instead.
        speech_speed: Speed multiplier passed to ``text_to_speech``.
        history_json: JSON-encoded list of prior ``{"role", "content"}``
            messages. Invalid or non-list JSON is treated as empty history.

    Returns:
        A ``(guidance, audio_file, history_json)`` tuple. On success
        ``history_json`` is the updated conversation. On any failure
        ``guidance`` carries the error message, ``audio_file`` is ``None`` and
        the incoming history is returned unchanged, so the conversation is
        not lost.
    """
    # What to hand back as history whenever no new turn was produced.
    unchanged_history = history_json or "[]"

    if screenshot is None:
        return "❌ Please upload a screenshot", None, unchanged_history

    try:
        # Parse history
        try:
            history = json.loads(history_json) if history_json else []
        except (json.JSONDecodeError, TypeError):
            history = []
        if not isinstance(history, list):
            history = []

        # The question actually put to the model; also what gets recorded.
        effective_query = (user_query or "").strip() or "What should I do next?"

        # 1. Analyze screenshot
        print("[INFO] Analyzing screenshot...")
        analysis = analyze_screenshot_with_clip(screenshot)
        screen_context = analysis["detected_context"]

        # 2. Generate guidance
        print("[INFO] Generating guidance...")
        guidance = generate_beginner_guidance(
            user_query=effective_query,
            screen_context=screen_context,
            history=history,
        )

        # 3. Generate speech (None when no audio could be produced)
        print("[INFO] Generating speech...")
        audio_file = text_to_speech(guidance, speed=speech_speed)

        # 4. Update history
        new_history = history + [
            {"role": "user", "content": effective_query},
            {"role": "assistant", "content": guidance},
        ]
        return guidance, audio_file, json.dumps(new_history)
    except Exception as e:
        # Top-level boundary for the UI: report the problem instead of raising.
        print(f"[ERROR] Tutor failed: {e}")
        return f"❌ Error: {str(e)}", None, unchanged_history
# ============================================
# BUILD GRADIO INTERFACE
# ============================================
with gr.Blocks(title="Coder Tutor", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🎓 Coder Tutor
    Real-time AI coaching for learning to code.
    **How to use:**
    1. 📸 Upload a screenshot of your screen
    2. ❓ Ask a question (e.g., "What's a function?")
    3. 🎧 Get explanation + hear audio guidance
    4. 🔄 Keep the conversation going with more questions
    """)

    with gr.Row():
        with gr.Column():
            # Inputs
            screenshot = gr.Image(
                label="📸 Screenshot",
                type="pil",
                scale=1
            )
            user_query = gr.Textbox(
                label="❓ Your Question",
                placeholder="E.g., 'What is a function?' or 'How do I fix this error?'",
                lines=2
            )
            speech_speed = gr.Slider(
                label="🎧 Speech Speed",
                minimum=0.5,
                maximum=2.0,
                value=1.0,
                step=0.1
            )
            submit_btn = gr.Button("🚀 Get Help", scale=2, variant="primary")

        with gr.Column():
            # Outputs
            guidance = gr.Textbox(
                label="💬 Guidance",
                lines=8,
                interactive=False
            )
            audio_output = gr.Audio(
                label="🔊 Listen to Explanation",
                type="filepath"
            )
            confidence = gr.Textbox(
                label="📊 Detected Context",
                interactive=False
            )

    # Hidden state for conversation history (a JSON-encoded list of messages)
    history_state = gr.State(value="[]")

    def _describe_context(image):
        """Return "<label> (<confidence>)" for the screenshot, or "" if unavailable."""
        if image is None:
            return ""
        try:
            analysis = analyze_screenshot_with_clip(image)
        except Exception as e:
            print(f"[ERROR] Context detection failed: {e}")
            return ""
        return f"{analysis['detected_context']} ({analysis['confidence']:.0%})"

    # Button click handler
    def on_submit(screenshot, query, speed, history_json):
        """Run the tutor and map its result onto the four output components."""
        guidance_text, audio, new_history = coder_tutor(
            screenshot, query, speed, history_json
        )
        # An empty string is not a valid audio path, and an empty history
        # would erase the conversation: show no audio and keep the old history.
        audio = audio or None
        new_history = new_history or history_json
        # coder_tutor does not return the detected context, so it is computed
        # here for display; this repeats the CLIP pass on the screenshot.
        return guidance_text, audio, new_history, _describe_context(screenshot)

    submit_btn.click(
        on_submit,
        inputs=[screenshot, user_query, speech_speed, history_state],
        outputs=[guidance, audio_output, history_state, confidence]
    )

    gr.Markdown("""
    ---
    **Tips for Best Results:**
    - Be specific: "Explain for loops" works better than "help"
    - Include relevant code in your screenshot
    - Adjust speech speed for your learning pace
    - One concept at a time - master it before moving on
    """)

if __name__ == "__main__":
    demo.launch()