"""Coder Tutor: a Gradio app that coaches beginner programmers.

The pipeline for each request is:

1. CLIP classifies an uploaded screenshot into a coarse "screen context".
2. A small instruction-tuned LLM writes a beginner-friendly explanation.
3. pyttsx3 renders that explanation to a WAV file for playback.

Models are loaded once at import time and shared by all requests.
"""

import base64
import io
import json
import tempfile
from pathlib import Path
from typing import Optional

import gradio as gr
import numpy as np
import pyttsx3
import torch
from PIL import Image
from transformers import (
    CLIPProcessor,
    CLIPModel,
    AutoTokenizer,
    AutoModelForCausalLM,
)

# ============================================
# CONFIGURATION
# ============================================
DEVICE = "cpu"
TORCH_DTYPE = torch.float32

# Model names (CPU-optimized)
CLIP_MODEL_NAME = "openai/clip-vit-base-patch32"
LLM_MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"

# Baseline speech rate passed to pyttsx3; scaled by the UI speed slider.
BASE_SPEECH_RATE = 150

# Query used when the user submits a screenshot without typing a question.
DEFAULT_QUERY = "What should I do next?"

# Candidate descriptions CLIP chooses between when classifying a screenshot.
SCREEN_LABELS = [
    "Python code editor",
    "JavaScript code",
    "HTML/CSS markup",
    "Terminal/console output",
    "Error message",
    "Browser DevTools",
    "IDE or text editor",
    "File explorer",
    "Command line",
    "Documentation page",
]

# Prompt template; {context} and {query} are filled in per request.
TUTOR_PROMPT_TEMPLATE = """You are an expert coding tutor teaching beginners.

Your rules:
1. Explain like they've never coded before - define every term
2. Use analogies - relate coding concepts to real-world things
3. Break it down - never give full solutions, only next small step
4. Be encouraging - celebrate small wins
5. Use simple language - avoid jargon without explanation
6. Give code examples - show concrete examples when relevant

Current screen context: {context}

User's question: {query}

Provide a step-by-step explanation (2-3 short paragraphs maximum).
Be friendly and encouraging."""

# ============================================
# INITIALIZE MODELS (Global, loaded once)
# ============================================
print("[INFO] Loading CLIP model...")
clip_model = CLIPModel.from_pretrained(
    CLIP_MODEL_NAME,
    torch_dtype=TORCH_DTYPE,
    device_map=DEVICE
).to(DEVICE).eval()
clip_processor = CLIPProcessor.from_pretrained(CLIP_MODEL_NAME)

print("[INFO] Loading LLM (Qwen2.5-1.5B)...")
# NOTE(review): trust_remote_code=True lets the model repository execute
# arbitrary Python on load. Presumably recent transformers releases support
# this model natively - confirm and drop the flag if so.
llm_tokenizer = AutoTokenizer.from_pretrained(
    LLM_MODEL_NAME,
    trust_remote_code=True
)
llm_model = AutoModelForCausalLM.from_pretrained(
    LLM_MODEL_NAME,
    torch_dtype=TORCH_DTYPE,
    device_map=DEVICE,
    trust_remote_code=True,
    low_cpu_mem_usage=True
).to(DEVICE).eval()

print("[INFO] Initializing TTS...")
tts_engine = pyttsx3.init()
tts_engine.setProperty('rate', BASE_SPEECH_RATE)  # Speech rate


# ============================================
# HELPER FUNCTIONS
# ============================================
def analyze_screenshot_with_clip(image: Image.Image) -> dict:
    """Classify a screenshot into one of ``SCREEN_LABELS`` using CLIP.

    Args:
        image: The screenshot to classify.

    Returns:
        A dict with ``"detected_context"`` (the best-matching label) and
        ``"confidence"`` (its softmax probability as a float).
    """
    # Resize for faster processing
    image = image.resize((224, 224), Image.Resampling.LANCZOS)

    inputs = clip_processor(
        text=SCREEN_LABELS,
        images=image,
        return_tensors="pt",
        padding=True
    ).to(DEVICE)

    with torch.no_grad():
        outputs = clip_model(**inputs)

    # logits_per_image is the cosine similarity of the normalized image and
    # text embeddings scaled by CLIP's learned temperature. A softmax over raw
    # dot products of un-normalized features does not yield usable scores.
    probs = outputs.logits_per_image.softmax(dim=-1).cpu().numpy()[0]

    top_idx = int(np.argmax(probs))
    return {
        "detected_context": SCREEN_LABELS[top_idx],
        "confidence": float(probs[top_idx]),
    }


def generate_beginner_guidance(
    user_query: str,
    screen_context: str,
    history: list
) -> str:
    """Generate a beginner-friendly explanation with the LLM.

    Args:
        user_query: The question to answer.
        screen_context: Label describing what is on the user's screen.
        history: Prior messages as dicts with ``"role"`` and ``"content"``
            keys; only the last four are included in the prompt.

    Returns:
        The generated explanation with surrounding whitespace stripped.
    """
    # Build history text from the last 4 messages. Any role other than
    # "user" is rendered as the assistant.
    history_lines = []
    for msg in history[-4:]:
        speaker = "User" if msg.get("role") == "user" else "Assistant"
        history_lines.append(f"{speaker}: {msg.get('content', '')}\n")
    history_text = "".join(history_lines)

    prompt = TUTOR_PROMPT_TEMPLATE.format(
        context=screen_context, query=user_query
    )
    if history_text:
        prompt += f"\n\nPrevious conversation:\n{history_text}"

    # Generate
    messages = [{"role": "user", "content": prompt}]
    text = llm_tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    model_inputs = llm_tokenizer(
        text,
        return_tensors="pt",
        padding=True
    ).to(DEVICE)

    with torch.no_grad():
        generated_ids = llm_model.generate(
            **model_inputs,
            max_new_tokens=200,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=llm_tokenizer.eos_token_id
        )

    # Drop the prompt tokens so only the newly generated reply is decoded.
    prompt_length = model_inputs.input_ids.shape[1]
    response = llm_tokenizer.decode(
        generated_ids[0][prompt_length:],
        skip_special_tokens=True
    )
    return response.strip()


def text_to_speech(text: str, speed: float = 1.0) -> Optional[str]:
    """Convert text to speech using pyttsx3.

    Args:
        text: The text to speak.
        speed: Multiplier applied to ``BASE_SPEECH_RATE``; the resulting rate
            is clamped to the range 50-300.

    Returns:
        Path of the generated WAV file, or ``None`` when ``text`` is empty or
        synthesis fails.
    """
    if not text or not text.strip():
        return None

    try:
        # Adjust speed
        rate = int(BASE_SPEECH_RATE * speed)
        tts_engine.setProperty('rate', max(50, min(300, rate)))

        # A unique file per call: a fixed "/tmp/speech.wav" does not exist on
        # every platform and would be overwritten by overlapping requests.
        # The file is left on disk because the caller needs it after return.
        with tempfile.NamedTemporaryFile(
            prefix="speech_", suffix=".wav", delete=False
        ) as tmp:
            temp_file = tmp.name

        tts_engine.save_to_file(text, temp_file)
        tts_engine.runAndWait()

        return temp_file
    except Exception as e:
        # Best effort: speech is optional, so report and carry on without it.
        print(f"[ERROR] TTS failed: {e}")
        return None


def _parse_history(history_json) -> list:
    """Decode the JSON history string into a list of message dicts.

    Missing, malformed, or non-list input yields an empty history, and
    entries that are not dicts are dropped.
    """
    if not history_json:
        return []
    try:
        history = json.loads(history_json)
    except (TypeError, ValueError):  # json.JSONDecodeError is a ValueError
        return []
    if not isinstance(history, list):
        return []
    return [msg for msg in history if isinstance(msg, dict)]


def _run_tutor(screenshot, user_query, speech_speed, history_json):
    """Run one tutoring step and also report the detected screen context.

    Returns:
        ``(guidance, audio_file, history_json, context_summary)``. On a
        missing screenshot or an error, the incoming history is returned
        unchanged so the conversation is not lost.
    """
    unchanged_history = history_json or "[]"

    if screenshot is None:
        return "❌ Please upload a screenshot", None, unchanged_history, ""

    try:
        history = _parse_history(history_json)
        query = (user_query or "").strip() or DEFAULT_QUERY

        # 1. Analyze screenshot
        print("[INFO] Analyzing screenshot...")
        analysis = analyze_screenshot_with_clip(screenshot)
        screen_context = analysis["detected_context"]
        context_summary = (
            f"{screen_context} ({analysis['confidence']:.0%} confidence)"
        )

        # 2. Generate guidance
        print("[INFO] Generating guidance...")
        guidance = generate_beginner_guidance(
            user_query=query,
            screen_context=screen_context,
            history=history
        )

        # 3. Generate speech
        print("[INFO] Generating speech...")
        audio_file = text_to_speech(guidance, speed=speech_speed)

        # 4. Update history with the question that was actually asked
        new_history = history + [
            {"role": "user", "content": query},
            {"role": "assistant", "content": guidance}
        ]

        return guidance, audio_file, json.dumps(new_history), context_summary

    except Exception as e:
        # Top-level boundary: surface the error in the UI instead of crashing.
        return f"❌ Error: {str(e)}", None, unchanged_history, ""


# ============================================
# GRADIO INTERFACE
# ============================================
def coder_tutor(
    screenshot: Image.Image,
    user_query: str,
    speech_speed: float,
    history_json: str
):
    """Main tutor function.

    Args:
        screenshot: Screenshot of the user's screen, or ``None``.
        user_query: The user's question; a default is used when empty.
        speech_speed: Speech-rate multiplier for the audio explanation.
        history_json: JSON-encoded list of previous messages.

    Returns:
        ``(guidance, audio_file, history_json)`` where ``audio_file`` is a
        path or ``None`` and ``history_json`` is the updated history (or the
        incoming history when the request could not be completed).
    """
    guidance, audio_file, new_history, _ = _run_tutor(
        screenshot, user_query, speech_speed, history_json
    )
    return guidance, audio_file, new_history


# ============================================
# BUILD GRADIO INTERFACE
# ============================================
with gr.Blocks(title="Coder Tutor", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🎓 Coder Tutor

    Real-time AI coaching for learning to code.

    **How to use:**
    1. 📸 Upload a screenshot of your screen
    2. ❓ Ask a question (e.g., "What's a function?")
    3. 🎧 Get explanation + hear audio guidance
    4. 🔄 Keep the conversation going with more questions
    """)

    with gr.Row():
        with gr.Column():
            # Inputs
            screenshot = gr.Image(
                label="📸 Screenshot",
                type="pil",
                scale=1
            )
            user_query = gr.Textbox(
                label="❓ Your Question",
                placeholder="E.g., 'What is a function?' or 'How do I fix this error?'",
                lines=2
            )
            speech_speed = gr.Slider(
                label="🎧 Speech Speed",
                minimum=0.5,
                maximum=2.0,
                value=1.0,
                step=0.1
            )
            submit_btn = gr.Button("🚀 Get Help", scale=2, variant="primary")

        with gr.Column():
            # Outputs
            guidance = gr.Textbox(
                label="💬 Guidance",
                lines=8,
                interactive=False
            )
            audio_output = gr.Audio(
                label="🔊 Listen to Explanation",
                type="filepath"
            )
            confidence = gr.Textbox(
                label="📊 Detected Context",
                interactive=False
            )

    # Hidden state for conversation history
    history_state = gr.State(value="[]")

    # Button click handler
    def on_submit(screenshot, query, speed, history_json):
        """Run the tutor and fill every output, including detected context."""
        return _run_tutor(screenshot, query, speed, history_json)

    submit_btn.click(
        on_submit,
        inputs=[screenshot, user_query, speech_speed, history_state],
        outputs=[guidance, audio_output, history_state, confidence]
    )

    gr.Markdown("""
    ---
    **Tips for Best Results:**
    - Be specific: "Explain for loops" works better than "help"
    - Include relevant code in your screenshot
    - Adjust speech speed for your learning pace
    - One concept at a time - master it before moving on
    """)


if __name__ == "__main__":
    demo.launch()