# Hugging Face Space: Adedoyinjames/Tutor
# Space status when this file was captured: "Build error".
# File size: 9,634 bytes.

import gradio as gr
import torch
import numpy as np
from PIL import Image
import base64
import io
from transformers import (
    CLIPProcessor, CLIPModel,
    AutoTokenizer, AutoModelForCausalLM,
)
import pyttsx3
import json
from pathlib import Path

# ============================================
# CONFIGURATION
# ============================================
# All inference runs on CPU in full (float32) precision.
DEVICE = "cpu"
TORCH_DTYPE = torch.float32

# Model names (CPU-optimized)
CLIP_MODEL_NAME = "openai/clip-vit-base-patch32"
LLM_MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"

# ============================================
# INITIALIZE MODELS (Global, loaded once)
# ============================================
# Each model is placed on DEVICE exactly once, via .to(); no device_map is
# passed because it would only repeat that placement.
print("[INFO] Loading CLIP model...")
clip_model = CLIPModel.from_pretrained(
    CLIP_MODEL_NAME,
    torch_dtype=TORCH_DTYPE,
).to(DEVICE).eval()
clip_processor = CLIPProcessor.from_pretrained(CLIP_MODEL_NAME)

print("[INFO] Loading LLM (Qwen2.5-1.5B)...")
# trust_remote_code is left at its default (False) so that no code shipped in
# the model repository is executed.
# NOTE(review): this assumes the installed transformers release bundles the
# Qwen2 architecture used by Qwen2.5 checkpoints - confirm the pinned version.
llm_tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME)
llm_model = AutoModelForCausalLM.from_pretrained(
    LLM_MODEL_NAME,
    torch_dtype=TORCH_DTYPE,
    low_cpu_mem_usage=True,
).to(DEVICE).eval()

print("[INFO] Initializing TTS...")
try:
    tts_engine = pyttsx3.init()
    tts_engine.setProperty('rate', 150)  # Speech rate
except Exception as e:
    # Speech is optional: if no engine can be created (e.g. no speech backend
    # is installed on the host) the app keeps running without audio instead
    # of failing at import time.
    print(f"[ERROR] TTS unavailable: {e}")
    tts_engine = None

# ============================================
# HELPER FUNCTIONS
# ============================================

def analyze_screenshot_with_clip(image: Image.Image) -> dict:
    """Classify a screenshot into one of a fixed set of screen contexts.

    The image is scored against a short text description of every candidate
    context with CLIP (zero-shot classification) and the best match is
    reported.

    Args:
        image: Screenshot to classify.

    Returns:
        A dict with two keys: ``"detected_context"`` (the best-matching label)
        and ``"confidence"`` (that label's softmax probability over the
        candidate labels, as a Python float).
    """
    # Candidate descriptions of what may be on screen
    labels = [
        "Python code editor",
        "JavaScript code",
        "HTML/CSS markup",
        "Terminal/console output",
        "Error message",
        "Browser DevTools",
        "IDE or text editor",
        "File explorer",
        "Command line",
        "Documentation page",
    ]

    # Resize for faster processing
    image = image.resize((224, 224), Image.Resampling.LANCZOS)

    # Encode the image and all labels in one processor call; padding only
    # applies to the text side, where labels differ in token length.
    inputs = clip_processor(
        text=labels,
        images=image,
        return_tensors="pt",
        padding=True,
    ).to(DEVICE)

    with torch.no_grad():
        outputs = clip_model(**inputs)

    # Use the model's own image-text logits rather than a raw dot product of
    # the embeddings: the forward pass normalises both embeddings and applies
    # CLIP's learned logit scale, which is what makes the softmax below a
    # meaningful score.
    probs = outputs.logits_per_image.softmax(dim=-1)[0]
    top_idx = int(probs.argmax())

    return {
        "detected_context": labels[top_idx],
        "confidence": float(probs[top_idx]),
    }

def _format_history(history, max_messages: int = 4) -> str:
    """Render the most recent chat messages as ``User:``/``Assistant:`` lines."""
    if not history:
        return ""

    lines = []
    for msg in history[-max_messages:]:
        if not isinstance(msg, dict):
            # Skip malformed entries instead of failing the whole request.
            continue
        speaker = "User" if msg.get("role") == "user" else "Assistant"
        lines.append(f"{speaker}: {msg.get('content', '')}\n")
    return "".join(lines)


def generate_beginner_guidance(
    user_query: str,
    screen_context: str,
    history: list
) -> str:
    """Generate a beginner-friendly explanation with the LLM.

    Args:
        user_query: The learner's question.
        screen_context: Label describing what is on the learner's screen.
        history: Earlier messages as dicts with ``"role"`` and ``"content"``
            keys; only the last four are included in the prompt. Entries
            that are not dicts are ignored.

    Returns:
        The model's reply with leading/trailing whitespace removed.
    """
    # System prompt
    system_prompt = """You are an expert coding tutor teaching beginners. Your rules:

1. Explain like they've never coded before - define every term
2. Use analogies - relate coding concepts to real-world things
3. Break it down - never give full solutions, only next small step
4. Be encouraging - celebrate small wins
5. Use simple language - avoid jargon without explanation
6. Give code examples - show concrete examples when relevant

Current screen context: {context}
User's question: {query}

Provide a step-by-step explanation (2-3 short paragraphs maximum). Be friendly and encouraging."""

    prompt = system_prompt.format(context=screen_context, query=user_query)

    history_text = _format_history(history)
    if history_text:
        prompt += f"\n\nPrevious conversation:\n{history_text}"

    # The whole prompt is sent as a single user turn of the chat template.
    messages = [{"role": "user", "content": prompt}]
    text = llm_tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    model_inputs = llm_tokenizer(text, return_tensors="pt").to(DEVICE)
    prompt_length = model_inputs.input_ids.shape[1]

    # Generate
    with torch.no_grad():
        generated_ids = llm_model.generate(
            **model_inputs,
            max_new_tokens=200,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=llm_tokenizer.eos_token_id
        )

    # generate() returns prompt + completion; decode only the new tokens.
    response = llm_tokenizer.decode(
        generated_ids[0][prompt_length:],
        skip_special_tokens=True
    )

    return response.strip()

def text_to_speech(text: str, speed: float = 1.0) -> "str | None":
    """Convert text to speech using pyttsx3.

    Args:
        text: Text to speak.
        speed: Multiplier applied to the base rate of 150; the resulting
            rate is clamped to the range 50-300.

    Returns:
        Path of a newly created WAV file, or ``None`` when there is nothing
        to speak or synthesis fails. The caller owns the returned file.
    """
    import contextlib
    import os
    import tempfile

    # Nothing to synthesise: skip the engine instead of writing an empty file.
    if not text or not text.strip():
        return None

    temp_file = None
    try:
        if tts_engine is None:
            raise RuntimeError("TTS engine is not available")

        # Adjust speed
        rate = int(150 * speed)
        tts_engine.setProperty('rate', max(50, min(300, rate)))

        # Write to a unique file in the platform's temp directory so that
        # requests cannot overwrite each other's audio and the path is not
        # tied to a POSIX-only location.
        fd, temp_file = tempfile.mkstemp(prefix="speech_", suffix=".wav")
        os.close(fd)  # the engine opens the path itself
        tts_engine.save_to_file(text, temp_file)
        tts_engine.runAndWait()

        return temp_file
    except Exception as e:
        print(f"[ERROR] TTS failed: {e}")
        # Do not leave a partial/empty file behind on failure.
        if temp_file is not None:
            with contextlib.suppress(OSError):
                os.remove(temp_file)
        return None

# ============================================
# GRADIO INTERFACE
# ============================================

def coder_tutor(
    screenshot: "Image.Image | None",
    user_query: str,
    speech_speed: float,
    history_json: str
) -> "tuple[str, str | None, str]":
    """Run one tutoring turn: classify the screen, explain, and speak.

    Args:
        screenshot: Screenshot of the learner's screen, or ``None`` if none
            was uploaded.
        user_query: The learner's question; when empty, the default question
            "What should I do next?" is used.
        speech_speed: Speed multiplier passed to the text-to-speech step.
        history_json: Conversation so far as a JSON list of
            ``{"role", "content"}`` dicts. Empty, invalid or non-list input
            is treated as an empty conversation.

    Returns:
        ``(guidance, audio_file, history_json)``. ``audio_file`` is ``None``
        when no audio was produced. When the turn cannot be completed, the
        first element is an error message and the returned history is the
        incoming conversation, unchanged, so a failed request never erases
        earlier turns.
    """
    # Parse history
    try:
        history = json.loads(history_json) if history_json else []
    except (TypeError, ValueError):  # JSONDecodeError is a ValueError
        history = []
    if not isinstance(history, list):
        history = []

    # What to hand back as state when this turn produces nothing new.
    unchanged_history = json.dumps(history)

    if screenshot is None:
        return "❌ Please upload a screenshot", None, unchanged_history

    # The question actually sent to the model is also the one recorded in the
    # history, so later prompts show what was really asked.
    query = (user_query or "").strip() or "What should I do next?"

    try:
        # 1. Analyze screenshot
        print("[INFO] Analyzing screenshot...")
        analysis = analyze_screenshot_with_clip(screenshot)
        screen_context = analysis["detected_context"]

        # 2. Generate guidance
        print("[INFO] Generating guidance...")
        guidance = generate_beginner_guidance(
            user_query=query,
            screen_context=screen_context,
            history=history
        )

        # 3. Generate speech
        print("[INFO] Generating speech...")
        audio_file = text_to_speech(guidance, speed=speech_speed)

        # 4. Update history
        new_history = history + [
            {"role": "user", "content": query},
            {"role": "assistant", "content": guidance}
        ]

        return guidance, audio_file, json.dumps(new_history)

    except Exception as e:
        # Top-level boundary for the UI: report the problem to the user and
        # the logs, and keep the conversation intact.
        print(f"[ERROR] Tutor request failed: {e}")
        return f"❌ Error: {str(e)}", None, unchanged_history

# ============================================
# BUILD GRADIO INTERFACE
# ============================================

# Two-column layout: inputs on the left, results on the right. The
# conversation is kept between clicks as a JSON string in a gr.State.
with gr.Blocks(title="Coder Tutor", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🎓 Coder Tutor
    
    Real-time AI coaching for learning to code.
    
    **How to use:**
    1. 📸 Upload a screenshot of your screen
    2. ❓ Ask a question (e.g., "What's a function?")
    3. 🎧 Get explanation + hear audio guidance
    4. 🔄 Keep the conversation going with more questions
    """)
    
    with gr.Row():
        with gr.Column():
            # Inputs
            screenshot = gr.Image(
                label="📸 Screenshot",
                type="pil",
                scale=1
            )
            
            user_query = gr.Textbox(
                label="❓ Your Question",
                placeholder="E.g., 'What is a function?' or 'How do I fix this error?'",
                lines=2
            )
            
            speech_speed = gr.Slider(
                label="🎧 Speech Speed",
                minimum=0.5,
                maximum=2.0,
                value=1.0,
                step=0.1
            )
            
            submit_btn = gr.Button("🚀 Get Help", scale=2, variant="primary")
        
        with gr.Column():
            # Outputs
            guidance = gr.Textbox(
                label="💬 Guidance",
                lines=8,
                interactive=False
            )
            
            audio_output = gr.Audio(
                label="🔊 Listen to Explanation",
                type="filepath"
            )
            
            confidence = gr.Textbox(
                label="📊 Detected Context",
                interactive=False
            )
    
    # Hidden state for conversation history
    history_state = gr.State(value="[]")
    
    def describe_context(image):
        """Return the text for the "Detected Context" box, or "" if unknown."""
        if image is None:
            return ""
        try:
            analysis = analyze_screenshot_with_clip(image)
        except Exception as e:
            # The context box is informational only; never fail the request
            # because it could not be filled in.
            print(f"[ERROR] Context detection failed: {e}")
            return ""
        return f"{analysis['detected_context']} ({analysis['confidence']:.0%})"
    
    # Button click handler
    def on_submit(image, query, speed, history_json):
        """Run one tutoring turn and fill every output component."""
        answer, audio, new_history = coder_tutor(
            image, query, speed, history_json
        )
        # coder_tutor does not return its screen analysis, so the label shown
        # in the "Detected Context" box is computed here with a second CLIP
        # pass over the same screenshot.
        return answer, audio, new_history, describe_context(image)
    
    submit_btn.click(
        on_submit,
        inputs=[screenshot, user_query, speech_speed, history_state],
        outputs=[guidance, audio_output, history_state, confidence]
    )
    
    gr.Markdown("""
    ---
    
    **Tips for Best Results:**
    - Be specific: "Explain for loops" works better than "help"
    - Include relevant code in your screenshot
    - Adjust speech speed for your learning pace
    - One concept at a time - master it before moving on
    """)

# Start the web server only when the file is run as a script.
if __name__ == "__main__":
    demo.launch()