# Source: Hugging Face Space "Adedoyinjames/Tutor" (Space status: Build error)
# File: Tutor/app.py
# Last commit: "Update app.py" by Adedoyinjames (6511d12, verified, 22 days ago)
# Size: 9.63 kB

	import gradio as gr
	import torch
	import numpy as np
	from PIL import Image
	import base64
	import io
	from transformers import (
	CLIPProcessor, CLIPModel,
	AutoTokenizer, AutoModelForCausalLM,
	)
	import pyttsx3
	import json
	from pathlib import Path

	# ============================================
	# CONFIGURATION
	# ============================================
	# All inference runs on this device, in this tensor precision.
	DEVICE: str = "cpu"
	TORCH_DTYPE: torch.dtype = torch.float32

	# Hugging Face Hub identifiers of the two checkpoints the app loads.
	CLIP_MODEL_NAME: str = "openai/clip-vit-base-patch32"
	LLM_MODEL_NAME: str = "Qwen/Qwen2.5-1.5B-Instruct"

	# ============================================
	# INITIALIZE MODELS (Global, loaded once)
	# ============================================
	# CLIP image/text encoder, used to classify what kind of screen is shown.
	print("[INFO] Loading CLIP model...")
	# device_map already places every weight on DEVICE, so the model is not
	# moved again with .to() afterwards.
	clip_model = CLIPModel.from_pretrained(
	    CLIP_MODEL_NAME,
	    torch_dtype=TORCH_DTYPE,
	    device_map=DEVICE,
	).eval()
	clip_processor = CLIPProcessor.from_pretrained(CLIP_MODEL_NAME)

	# Instruction-tuned LLM that writes the tutoring answer.
	print("[INFO] Loading LLM (Qwen2.5-1.5B)...")
	# trust_remote_code is deliberately not enabled: the Qwen2.5 architecture
	# ships with transformers, and the flag would allow code from the Hub
	# repository to execute in this process.
	llm_tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME)
	llm_model = AutoModelForCausalLM.from_pretrained(
	    LLM_MODEL_NAME,
	    torch_dtype=TORCH_DTYPE,
	    device_map=DEVICE,
	    low_cpu_mem_usage=True,
	).eval()

	# Speech output is optional: if no speech backend can be initialised the
	# app keeps running text-only and tts_engine is left as None.
	print("[INFO] Initializing TTS...")
	try:
	    tts_engine = pyttsx3.init()
	    tts_engine.setProperty('rate', 150)  # Speech rate
	except Exception as e:
	    print(f"[ERROR] TTS unavailable, continuing without audio: {e}")
	    tts_engine = None

	# ============================================
	# HELPER FUNCTIONS
	# ============================================

	def analyze_screenshot_with_clip(image: Image.Image) -> dict:
	    """Classify a screenshot against a fixed list of screen-type labels.

	    The image and the candidate labels are embedded with CLIP and compared;
	    the best-matching label is reported together with its softmax
	    probability over the candidate labels.

	    Args:
	        image: Screenshot as a PIL image.

	    Returns:
	        A dict with ``"detected_context"`` (the winning label) and
	        ``"confidence"`` (its probability as a Python float).
	    """
	    # Candidate descriptions of what might be on screen
	    labels = [
	        "Python code editor",
	        "JavaScript code",
	        "HTML/CSS markup",
	        "Terminal/console output",
	        "Error message",
	        "Browser DevTools",
	        "IDE or text editor",
	        "File explorer",
	        "Command line",
	        "Documentation page",
	    ]

	    with torch.no_grad():
	        # Resize for faster processing
	        image = image.resize((224, 224), Image.Resampling.LANCZOS)

	        image_inputs = clip_processor(
	            images=image,
	            return_tensors="pt",
	        ).to(DEVICE)
	        text_inputs = clip_processor(
	            text=labels,
	            return_tensors="pt",
	            padding=True,
	        ).to(DEVICE)

	        image_features = clip_model.get_image_features(**image_inputs)
	        text_features = clip_model.get_text_features(**text_inputs)

	        # CLIP scores are cosine similarities scaled by the model's learned
	        # temperature. Without L2-normalising both embeddings and applying
	        # logit_scale, the softmax below does not yield CLIP's probabilities.
	        image_features = image_features / image_features.norm(dim=-1, keepdim=True)
	        text_features = text_features / text_features.norm(dim=-1, keepdim=True)
	        logits_per_image = clip_model.logit_scale.exp() * (
	            image_features @ text_features.t()
	        )
	        probs = logits_per_image.softmax(dim=-1).cpu().numpy()[0]

	    top_idx = int(np.argmax(probs))

	    return {
	        "detected_context": labels[top_idx],
	        "confidence": float(probs[top_idx]),
	    }

	def generate_beginner_guidance(
	    user_query: str,
	    screen_context: str,
	    history: list
	) -> str:
	    """Generate a beginner-friendly explanation with the LLM.

	    Args:
	        user_query: The question to answer.
	        screen_context: Description of what is on the user's screen.
	        history: Earlier messages as dicts with ``"role"`` and ``"content"``
	            keys; only the last four are included in the prompt.

	    Returns:
	        The generated answer with surrounding whitespace stripped.
	    """
	    # Render the last 4 messages as plain "Speaker: text" lines. Entries that
	    # are not dicts are skipped and a missing/None content becomes empty text,
	    # so a malformed history cannot abort the request.
	    history_lines = []
	    for msg in (history or [])[-4:]:
	        if not isinstance(msg, dict):
	            continue
	        speaker = "User" if msg.get("role") == "user" else "Assistant"
	        history_lines.append(f"{speaker}: {msg.get('content') or ''}\n")
	    history_text = "".join(history_lines)

	    # The template is assembled from explicit lines so that the indentation
	    # of this source file can never end up inside the prompt.
	    system_prompt = (
	        "You are an expert coding tutor teaching beginners. Your rules:\n"
	        "\n"
	        "1. Explain like they've never coded before - define every term\n"
	        "2. Use analogies - relate coding concepts to real-world things\n"
	        "3. Break it down - never give full solutions, only next small step\n"
	        "4. Be encouraging - celebrate small wins\n"
	        "5. Use simple language - avoid jargon without explanation\n"
	        "6. Give code examples - show concrete examples when relevant\n"
	        "\n"
	        "Current screen context: {context}\n"
	        "User's question: {query}\n"
	        "\n"
	        "Provide a step-by-step explanation (2-3 short paragraphs maximum). "
	        "Be friendly and encouraging."
	    )

	    prompt = system_prompt.format(context=screen_context, query=user_query)

	    if history_text:
	        prompt += f"\n\nPrevious conversation:\n{history_text}"

	    # The whole prompt is sent as a single user turn.
	    messages = [{"role": "user", "content": prompt}]

	    text = llm_tokenizer.apply_chat_template(
	        messages,
	        tokenize=False,
	        add_generation_prompt=True
	    )

	    # A single sequence needs no padding.
	    model_inputs = llm_tokenizer(text, return_tensors="pt").to(DEVICE)

	    with torch.no_grad():
	        generated_ids = llm_model.generate(
	            **model_inputs,
	            max_new_tokens=200,
	            temperature=0.7,
	            top_p=0.9,
	            do_sample=True,
	            pad_token_id=llm_tokenizer.eos_token_id
	        )

	    # generate() returns prompt + continuation; keep only the new tokens.
	    prompt_length = model_inputs.input_ids.shape[1]
	    response = llm_tokenizer.decode(
	        generated_ids[0][prompt_length:],
	        skip_special_tokens=True
	    )

	    return response.strip()

	def text_to_speech(text: str, speed: float = 1.0) -> "str | None":
	    """Convert text to speech using pyttsx3.

	    Args:
	        text: Text to speak.
	        speed: Multiplier applied to the base rate of 150; the resulting
	            rate is clamped to the range 50-300.

	    Returns:
	        Path of the generated audio file, or ``None`` when there is nothing
	        to speak or synthesis fails (the failure is printed, not raised).
	    """
	    import os
	    import tempfile

	    # Nothing to say: skip the engine entirely.
	    if not text or not text.strip():
	        return None

	    temp_file = None
	    try:
	        if tts_engine is None:
	            raise RuntimeError("TTS engine is not available")

	        # Adjust speed
	        rate = int(150 * speed)
	        tts_engine.setProperty('rate', max(50, min(300, rate)))

	        # Use a fresh file per call: with one fixed path, a synthesis that
	        # silently writes nothing would hand back audio from an earlier call.
	        fd, temp_file = tempfile.mkstemp(prefix="speech_", suffix=".wav")
	        os.close(fd)

	        tts_engine.save_to_file(text, temp_file)
	        tts_engine.runAndWait()

	        # mkstemp creates an empty file, so zero bytes means no audio was written.
	        if os.path.getsize(temp_file) == 0:
	            raise RuntimeError("TTS engine produced no audio")

	        return temp_file
	    except Exception as e:
	        print(f"[ERROR] TTS failed: {e}")
	        # Do not leave an unusable temp file behind.
	        if temp_file is not None:
	            try:
	                os.remove(temp_file)
	            except OSError:
	                pass
	        return None

	# ============================================
	# GRADIO INTERFACE
	# ============================================

	def coder_tutor(
	    screenshot: "Image.Image",
	    user_query: str,
	    speech_speed: float,
	    history_json: str
	):
	    """Main tutor function: screenshot + question -> guidance, audio, history.

	    Args:
	        screenshot: Uploaded screenshot, or ``None`` if nothing was uploaded.
	        user_query: The user's question; a default question is used when
	            it is empty.
	        speech_speed: Speed multiplier passed to ``text_to_speech``.
	        history_json: JSON-encoded list of earlier messages.

	    Returns:
	        A 3-tuple ``(guidance_text, audio_path_or_None, history_json)``.
	        When no screenshot is given or any step fails, the first element is
	        an error message, the audio is ``None`` and the history is returned
	        unchanged so the conversation is not lost.
	    """
	    # Parse history; anything that is not a JSON list counts as "no history".
	    try:
	        history = json.loads(history_json) if history_json else []
	    except (TypeError, ValueError):
	        history = []
	    if not isinstance(history, list):
	        history = []
	    unchanged_history = json.dumps(history)

	    if screenshot is None:
	        return "❌ Please upload a screenshot", None, unchanged_history

	    # The question actually sent to the model is also the one recorded in
	    # the history, so later turns see what was really asked.
	    effective_query = (user_query or "").strip() or "What should I do next?"

	    try:
	        # 1. Analyze screenshot
	        print("[INFO] Analyzing screenshot...")
	        analysis = analyze_screenshot_with_clip(screenshot)
	        screen_context = analysis["detected_context"]

	        # 2. Generate guidance
	        print("[INFO] Generating guidance...")
	        guidance = generate_beginner_guidance(
	            user_query=effective_query,
	            screen_context=screen_context,
	            history=history
	        )

	        # 3. Generate speech
	        print("[INFO] Generating speech...")
	        audio_file = text_to_speech(guidance, speed=speech_speed)

	        # 4. Update history
	        new_history = history + [
	            {"role": "user", "content": effective_query},
	            {"role": "assistant", "content": guidance}
	        ]

	        return guidance, audio_file, json.dumps(new_history)

	    except Exception as e:
	        # Top-level boundary for the UI: report the problem as text instead
	        # of raising, and keep the existing conversation intact.
	        return f"❌ Error: {str(e)}", None, unchanged_history

	# ============================================
	# BUILD GRADIO INTERFACE
	# ============================================

	# Two-column page: inputs on the left, results on the right. The
	# conversation history travels between calls as a JSON string in gr.State.
	with gr.Blocks(title="Coder Tutor", theme=gr.themes.Soft()) as demo:
	    # Intro / usage instructions
	    gr.Markdown("""
	# 🎓 Coder Tutor

	Real-time AI coaching for learning to code.

	How to use:
	1. 📸 Upload a screenshot of your screen
	2. ❓ Ask a question (e.g., "What's a function?")
	3. 🎧 Get explanation + hear audio guidance
	4. 🔄 Keep the conversation going with more questions
	""")

	    with gr.Row():
	        # Left column: what the user provides
	        with gr.Column():
	            screenshot = gr.Image(label="📸 Screenshot", type="pil", scale=1)

	            user_query = gr.Textbox(
	                label="❓ Your Question",
	                placeholder="E.g., 'What is a function?' or 'How do I fix this error?'",
	                lines=2,
	            )

	            speech_speed = gr.Slider(
	                label="🎧 Speech Speed",
	                minimum=0.5,
	                maximum=2.0,
	                value=1.0,
	                step=0.1,
	            )

	            submit_btn = gr.Button("🚀 Get Help", scale=2, variant="primary")

	        # Right column: what the tutor returns
	        with gr.Column():
	            guidance = gr.Textbox(label="💬 Guidance", lines=8, interactive=False)

	            audio_output = gr.Audio(label="🔊 Listen to Explanation", type="filepath")

	            # NOTE(review): this box is not listed in the click handler's
	            # outputs below, so nothing ever writes to it.
	            confidence = gr.Textbox(label="📊 Detected Context", interactive=False)

	    # Per-session conversation history, stored as a JSON string
	    history_state = gr.State(value="[]")

	    def on_submit(screenshot, query, speed, history_json):
	        """Forward the form values to coder_tutor and return its three results."""
	        return coder_tutor(screenshot, query, speed, history_json)

	    # The returned history replaces the stored one after every click.
	    submit_btn.click(
	        on_submit,
	        inputs=[screenshot, user_query, speech_speed, history_state],
	        outputs=[guidance, audio_output, history_state],
	    )

	    # Footer with usage tips
	    gr.Markdown("""
	---

	Tips for Best Results:
	- Be specific: "Explain for loops" works better than "help"
	- Include relevant code in your screenshot
	- Adjust speech speed for your learning pace
	- One concept at a time - master it before moving on
	""")

	# Start the web server only when run as a script, not when imported.
	if __name__ == "__main__":
	    demo.launch()