Spaces:

Adedoyinjames
/

Tutor

Build error

App Files Files Community

Adedoyinjames committed about 1 month ago

Commit

9b08f3f

verified ·

1 Parent(s): 0f6dcc4

Create app.py

Browse files

Files changed (1) hide show

app.py +377 -0

app.py ADDED Viewed

	@@ -0,0 +1,377 @@

+import torch
+import numpy as np
+from fastapi import FastAPI, HTTPException
+from fastapi.responses import FileResponse, JSONResponse
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel
+from PIL import Image
+import base64
+import io
+import json
+import os
+from pathlib import Path
+import tempfile
+import uvicorn
+# ============================================
+# IMPORTS FOR MODELS
+# ============================================
+from transformers import (
+    CLIPProcessor, CLIPModel,
+    AutoTokenizer, AutoModelForCausalLM,
+    pipeline
+)
+from TTS.api import TTS
+# ============================================
+# CONFIGURATION FOR CPU
+# ============================================
# All models run on the CPU, with weights kept in float32.
DEVICE = "cpu"
TORCH_DTYPE = torch.float32
# Model identifiers. Each one can be overridden with an environment variable of
# the same name, so a deployment can swap checkpoints without a code change;
# an unset or empty variable falls back to the default below.
# NOTE: these are the regular published checkpoints - nothing in this file
# applies quantization to them.
CLIP_MODEL_NAME = os.environ.get("CLIP_MODEL_NAME") or "openai/clip-vit-base-patch32"
LLM_MODEL_NAME = os.environ.get("LLM_MODEL_NAME") or "Qwen/Qwen2.5-1.5B-Instruct"
TTS_MODEL_NAME = os.environ.get("TTS_MODEL_NAME") or "tts_models/en/ljspeech/glow-tts"
+# ============================================
+# INITIALIZE MODELS (Global, loaded once)
+# ============================================
# The three models are loaded once, at import time, and shared by every request.
print("[INFO] Loading CLIP model...")
# No `device_map` argument: the explicit `.to(DEVICE)` already places the model,
# and passing both was redundant.
clip_model = CLIPModel.from_pretrained(
    CLIP_MODEL_NAME,
    torch_dtype=TORCH_DTYPE,
).to(DEVICE).eval()
clip_processor = CLIPProcessor.from_pretrained(CLIP_MODEL_NAME)
print("[INFO] Loading LLM (Qwen2.5-1.5B)...")
# `trust_remote_code` is deliberately not set: it allows code shipped in the
# model repository to run locally, and the Qwen2.5 architecture is implemented
# inside transformers itself, so nothing is gained by enabling it.
llm_tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME)
llm_model = AutoModelForCausalLM.from_pretrained(
    LLM_MODEL_NAME,
    torch_dtype=TORCH_DTYPE,
    low_cpu_mem_usage=True,
).to(DEVICE).eval()
print("[INFO] Loading TTS model (Glow-TTS)...")
tts = TTS(model_name=TTS_MODEL_NAME, gpu=False, progress_bar=False, verbose=False)
+# ============================================
+# FAST API APP
+# ============================================
app = FastAPI(title="Coder Tutor Backend", version="1.0")
# CORS middleware for frontend communication.
# Allowed origins come from the comma-separated ALLOWED_ORIGINS environment
# variable and default to "*" (any origin). Credentialed requests (cookies,
# Authorization headers) are only enabled when an explicit origin list is
# configured: combining a wildcard origin with credentials would let any web
# site make authenticated calls on behalf of a visitor.
_cors_origins = [
    origin.strip()
    for origin in os.getenv("ALLOWED_ORIGINS", "*").split(",")
    if origin.strip()
] or ["*"]
app.add_middleware(
    CORSMiddleware,
    allow_origins=_cors_origins,
    allow_credentials="*" not in _cors_origins,
    allow_methods=["*"],
    allow_headers=["*"],
)
+# ============================================
+# PYDANTIC MODELS
+# ============================================
class LearningRequest(BaseModel):
    """Request body for ``POST /learn``."""

    # Screenshot of the user's screen as a base64-encoded image.
    screenshot_base64: str
    # The question or problem the user wants help with.
    user_query: str
    # Earlier turns of the conversation. Each entry is a dict with a "role"
    # ("user" or "assistant") and the text under "query" (user turns) or
    # "guidance" (assistant turns). Typing the items as dicts makes malformed
    # history fail request validation instead of crashing later on `.get`.
    conversation_history: list[dict] = []
    # Requested TTS speed multiplier (intended range 0.5-2.0; not validated).
    speech_speed: float = 1.0
class LearningResponse(BaseModel):
    """Response body returned by ``POST /learn``."""

    # Tutoring text generated by the LLM.
    guidance: str
    # Despite the name, the /learn handler fills this with a
    # ``data:audio/wav;base64,...`` URI that embeds the audio itself.
    audio_url: str
    # CLIP probability of the top detected screen context.
    confidence: float
+# ============================================
+# HELPER FUNCTIONS
+# ============================================
def decode_image(image_base64: str) -> Image.Image:
    """Decode a base64 image string into a 224x224 RGB PIL image.

    Accepts either raw base64 or a full data URI such as
    ``data:image/png;base64,<payload>``.

    Args:
        image_base64: The encoded image.

    Returns:
        The decoded image, converted to RGB and resized to 224x224.

    Raises:
        HTTPException: 400 if the payload cannot be decoded or is not an image.
    """
    payload = image_base64.strip()
    # Browser APIs usually hand over a data URI. Without stripping the header,
    # b64decode would treat its letters as payload and corrupt the image bytes.
    if payload.startswith("data:"):
        _, _, payload = payload.partition(",")
    try:
        image_data = base64.b64decode(payload)
        image = Image.open(io.BytesIO(image_data)).convert("RGB")
        # Resize for faster processing (CLIP works well with 224x224)
        return image.resize((224, 224), Image.Resampling.LANCZOS)
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Invalid image: {str(e)}") from e
def analyze_screenshot_with_clip(image: Image.Image) -> dict:
    """Classify a screenshot against a fixed set of screen-context labels with CLIP.

    Args:
        image: The screenshot as a PIL image.

    Returns:
        A dict with:
          - "detected_context": the label with the highest probability,
          - "confidence": that label's probability,
          - "all_probs": mapping of every label to its probability.
    """
    labels = [
        "Python code editor",
        "JavaScript code",
        "HTML/CSS markup",
        "Terminal/console output",
        "Error message",
        "Browser DevTools",
        "IDE or text editor",
        "File explorer",
        "Command line",
        "Documentation page",
    ]
    with torch.no_grad():
        image_inputs = clip_processor(images=image, return_tensors="pt").to(DEVICE)
        image_features = clip_model.get_image_features(**image_inputs)
        text_inputs = clip_processor(
            text=labels,
            return_tensors="pt",
            padding=True,
        ).to(DEVICE)
        text_features = clip_model.get_text_features(**text_inputs)
        # CLIP similarity is a scaled cosine similarity: both embeddings must be
        # L2-normalized and the product multiplied by the learned logit scale.
        # A raw dot product of un-normalized features gives probabilities that
        # depend on embedding magnitudes rather than on the actual match.
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)
        logits_per_image = clip_model.logit_scale.exp() * (
            image_features @ text_features.t()
        )
        probs = logits_per_image.softmax(dim=-1).cpu().numpy()[0]
    top_idx = int(np.argmax(probs))
    return {
        "detected_context": labels[top_idx],
        "confidence": float(probs[top_idx]),
        "all_probs": {label: float(prob) for label, prob in zip(labels, probs)},
    }
# Prompt sent to the LLM; {context} and {query} are filled in per request.
_TUTOR_PROMPT_TEMPLATE = """You are an expert coding tutor teaching beginners. Your rules:
1. **Explain like they've never coded before** - define every term
2. **Use analogies** - relate coding concepts to real-world things
3. **Break it down** - never give full solutions, only next small step
4. **Ask questions** - encourage thinking, don't just tell
5. **Be encouraging** - celebrate small wins
6. **Use simple language** - avoid jargon without explanation
7. **Give code examples** - when relevant, show concrete examples
Current screen context: {context}
User's question/problem: {query}
Provide a step-by-step explanation of what they should do next. Keep it to 2-3 short paragraphs maximum."""


def _format_history(conversation_history: list) -> str:
    """Render the last four turns as "User: ..." / "Assistant: ..." lines."""
    lines = []
    for msg in (conversation_history or [])[-4:]:
        # History comes straight from the client; skip anything that is not a
        # dict instead of failing the whole request on `.get`.
        if not isinstance(msg, dict):
            continue
        role = msg.get("role")
        # `or ''` also covers keys that are present but null, which would
        # otherwise be rendered as the literal text "None".
        if role == "user":
            lines.append(f"User: {msg.get('query') or ''}\n")
        elif role == "assistant":
            lines.append(f"Assistant: {msg.get('guidance') or ''}\n")
    return "".join(lines)


def generate_beginner_guidance(
    user_query: str,
    screen_context: str,
    conversation_history: list
) -> str:
    """Generate a beginner-friendly explanation with the LLM.

    Args:
        user_query: The user's question or problem.
        screen_context: Label describing what is on the user's screen.
        conversation_history: Earlier turns as dicts with a "role" key and the
            text under "query" (user) or "guidance" (assistant). Only the last
            four entries are used; malformed entries are ignored.

    Returns:
        The generated guidance text, stripped of surrounding whitespace.
    """
    prompt = _TUTOR_PROMPT_TEMPLATE.format(context=screen_context, query=user_query)
    history_text = _format_history(conversation_history)
    if history_text:
        prompt += f"\n\nPrevious conversation:\n{history_text}"
    messages = [{"role": "user", "content": prompt}]
    with torch.no_grad():
        text = llm_tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
        model_inputs = llm_tokenizer(
            text,
            return_tensors="pt",
            padding=True
        ).to(DEVICE)
        generated_ids = llm_model.generate(
            **model_inputs,
            max_new_tokens=256,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=llm_tokenizer.eos_token_id
        )
        # generate() returns prompt + completion; drop the prompt tokens so
        # only the newly generated answer is decoded.
        prompt_length = model_inputs.input_ids.shape[1]
        response = llm_tokenizer.decode(
            generated_ids[0][prompt_length:],
            skip_special_tokens=True
        )
    return response.strip()
def generate_speech(text: str, speed: float = 1.0) -> str:
    """Synthesize ``text`` to a WAV file with Coqui TTS and return its path.

    Args:
        text: The text to speak.
        speed: Requested speed multiplier. It is accepted for API
            compatibility but is NOT applied: the synthesis call below has no
            speed control, so audio is always produced at the model's default
            rate.

    Returns:
        Path of the generated WAV file inside the system temp directory.

    Raises:
        HTTPException: 500 if synthesis fails.
    """
    import threading  # local import: only needed to build the file name

    try:
        # One file per process/thread instead of a single shared name, so two
        # workers can never overwrite each other's audio between the write here
        # and the caller's read. Reusing the name per worker keeps the number
        # of temp files bounded without requiring callers to clean up.
        audio_file = os.path.join(
            tempfile.gettempdir(),
            f"guidance_speech_{os.getpid()}_{threading.get_ident()}.wav",
        )
        tts.tts_to_file(
            text=text,
            file_path=audio_file,
            speaker=tts.speakers[0] if tts.speakers else None
        )
        return audio_file
    except Exception as e:
        print(f"[ERROR] TTS generation failed: {str(e)}")
        raise HTTPException(status_code=500, detail=f"TTS failed: {str(e)}") from e
+# ============================================
+# API ENDPOINTS
+# ============================================
@app.post("/learn", response_model=LearningResponse)
async def learn(request: LearningRequest):
    """
    Full tutoring pipeline: screenshot + question in, guidance text + audio out.

    Steps: decode the screenshot, classify the screen with CLIP, ask the LLM
    for guidance, synthesize that guidance to speech, and return the text
    together with the audio embedded as a base64 WAV data URI.
    HTTPExceptions raised by the helpers pass through unchanged; any other
    error is reported as a 500.
    """
    try:
        # Step 1: screenshot -> screen context
        print("[INFO] Decoding screenshot...")
        screenshot = decode_image(request.screenshot_base64)
        print("[INFO] Analyzing screen with CLIP...")
        analysis = analyze_screenshot_with_clip(screenshot)
        detected = analysis["detected_context"]

        # Step 2: screen context + question -> guidance text
        print("[INFO] Generating guidance with LLM...")
        tutor_text = generate_beginner_guidance(
            user_query=request.user_query,
            screen_context=detected,
            conversation_history=request.conversation_history,
        )

        # Step 3: guidance text -> WAV file on disk
        print("[INFO] Generating speech...")
        wav_path = generate_speech(tutor_text, speed=request.speech_speed)

        # Step 4: inline the audio into the response as a data URI
        encoded_wav = base64.b64encode(Path(wav_path).read_bytes()).decode()
        return LearningResponse(
            guidance=tutor_text,
            audio_url=f"data:audio/wav;base64,{encoded_wav}",
            confidence=analysis["confidence"],
        )
    except HTTPException:
        raise
    except Exception as exc:
        print(f"[ERROR] {str(exc)}")
        raise HTTPException(status_code=500, detail=str(exc))
class AnalyzeRequest(BaseModel):
    """Request body for ``POST /analyze-screenshot``."""

    screenshot_base64: str  # Base64 encoded image


@app.post("/analyze-screenshot")
async def analyze_screenshot(request: AnalyzeRequest):
    """
    Endpoint to just analyze what's on screen without generating guidance.
    Useful for debugging or understanding context.

    The body schema must be the parameter's annotation: with a bare
    ``BaseModel`` annotation the parsed body has no fields, so reading
    ``request.screenshot_base64`` failed on every call.

    Returns a JSON object with "detected_context", "confidence" and
    "all_detections" (every label with its probability).
    """
    try:
        image = decode_image(request.screenshot_base64)
        analysis = analyze_screenshot_with_clip(image)
        return JSONResponse({
            "detected_context": analysis["detected_context"],
            "confidence": analysis["confidence"],
            "all_detections": analysis["all_probs"]
        })
    except HTTPException:
        # Keep the helper's status code (e.g. 400 for an invalid image)
        # instead of re-wrapping it as a 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
class TTSRequest(BaseModel):
    """Request body for ``POST /tts``."""

    text: str  # Text to convert to speech
    speed: float = 1.0  # Requested speed multiplier, passed to generate_speech


@app.post("/tts")
async def text_to_speech(request: TTSRequest):
    """
    Standalone TTS endpoint for converting text to speech.
    Useful if you want to decouple TTS from the main learning flow.

    The body schema must be the parameter's annotation: with a bare
    ``BaseModel`` annotation the parsed body has no fields, so reading
    ``request.text`` failed on every call.

    Returns a JSON object whose "audio_url" is a base64 WAV data URI.
    Responds with 400 when the text is empty or whitespace only.
    """
    if not request.text.strip():
        raise HTTPException(status_code=400, detail="text must not be empty")
    try:
        audio_file = generate_speech(request.text, speed=request.speed)
        with open(audio_file, "rb") as f:
            audio_base64 = base64.b64encode(f.read()).decode()
        return JSONResponse({
            "audio_url": f"data:audio/wav;base64,{audio_base64}"
        })
    except HTTPException:
        # Preserve the status code chosen by generate_speech.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
@app.get("/health")
async def health_check():
    """Report service status plus the device and model names in use."""
    configured_models = {
        "clip_model": CLIP_MODEL_NAME,
        "llm_model": LLM_MODEL_NAME,
        "tts_model": TTS_MODEL_NAME,
    }
    return {"status": "healthy", "device": DEVICE, **configured_models}
@app.get("/")
async def root():
    """Describe the service: name, version, available endpoints and models."""
    endpoint_docs = {
        "POST /learn": "Main endpoint - send screenshot + query, get guidance + speech",
        "POST /analyze-screenshot": "Analyze what's on screen",
        "POST /tts": "Standalone text-to-speech conversion",
        "GET /health": "Health check with model info",
    }
    model_docs = {
        "image_encoder": CLIP_MODEL_NAME,
        "llm": LLM_MODEL_NAME,
        "tts": TTS_MODEL_NAME,
    }
    return {
        "name": "Coder Tutor Backend",
        "version": "1.0",
        "endpoints": endpoint_docs,
        "models": model_docs,
    }
+# ============================================
+# RUN SERVER
+# ============================================
if __name__ == "__main__":
    # Check if running on Hugging Face Spaces
    space_id = os.getenv("SPACE_ID", None)
    if space_id:
        print(f"[INFO] Running on Hugging Face Space: {space_id}")
        # HF Spaces exposes port 7860 by default
        uvicorn.run(app, host="0.0.0.0", port=7860)
    else:
        # Local development with auto-reload. uvicorn only supports `reload`
        # when the application is given as an import string ("module:attr");
        # passing the app object makes it log a warning and exit immediately.
        # "app:app" refers to the `app` object in this file, app.py.
        uvicorn.run("app:app", host="127.0.0.1", port=8000, reload=True)