Spaces:
Build error
Build error
| import gradio as gr | |
| import torch | |
| import numpy as np | |
| from PIL import Image | |
| import base64 | |
| import io | |
| from transformers import ( | |
| CLIPProcessor, CLIPModel, | |
| AutoTokenizer, AutoModelForCausalLM, | |
| ) | |
| import pyttsx3 | |
| import json | |
| from pathlib import Path | |
# ============================================
# CONFIGURATION
# ============================================
# Device string handed to every model load and every tensor `.to()` call in
# this file.
DEVICE = "cpu"
# dtype passed as `torch_dtype` when loading both models.
TORCH_DTYPE = torch.float32
# Model names (CPU-optimized)
# Hub repo ids: the CLIP model labels the screenshot, the LLM writes the
# tutoring text.
CLIP_MODEL_NAME = "openai/clip-vit-base-patch32"
LLM_MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
# ============================================
# INITIALIZE MODELS (Global, loaded once)
# ============================================
# Everything below runs at import time so that each request reuses the same
# in-memory objects instead of reloading them.
print("[INFO] Loading CLIP model...")
clip_model = CLIPModel.from_pretrained(
    CLIP_MODEL_NAME,
    torch_dtype=TORCH_DTYPE,
    device_map=DEVICE,
).to(DEVICE).eval()
clip_processor = CLIPProcessor.from_pretrained(CLIP_MODEL_NAME)

print("[INFO] Loading LLM (Qwen2.5-1.5B)...")
# NOTE(review): trust_remote_code=True allows Python shipped in the Hub repo
# to execute locally. Confirm it is actually required for this checkpoint and
# drop it (or pin a revision) if not.
llm_tokenizer = AutoTokenizer.from_pretrained(
    LLM_MODEL_NAME,
    trust_remote_code=True,
)
llm_model = AutoModelForCausalLM.from_pretrained(
    LLM_MODEL_NAME,
    torch_dtype=TORCH_DTYPE,
    device_map=DEVICE,
    trust_remote_code=True,
    low_cpu_mem_usage=True,
).to(DEVICE).eval()

print("[INFO] Initializing TTS...")
# Speech is a best-effort extra: text_to_speech() catches every exception and
# returns None, so a missing speech backend must not stop the app from
# starting. With tts_engine left as None, that function simply reports the
# failure and the UI shows text-only guidance.
try:
    tts_engine = pyttsx3.init()
    tts_engine.setProperty('rate', 150)  # Speech rate
except Exception as e:  # top-level boundary: log and continue without audio
    print(f"[ERROR] TTS init failed: {e}")
    tts_engine = None
| # ============================================ | |
| # HELPER FUNCTIONS | |
| # ============================================ | |
def analyze_screenshot_with_clip(image: Image.Image) -> dict:
    """Use CLIP to guess what kind of screen the screenshot shows.

    The image is scored zero-shot against a fixed list of screen
    descriptions and the best-scoring one is returned.

    Args:
        image: The screenshot as a PIL image.

    Returns:
        A dict with two keys: ``"detected_context"`` (the winning label) and
        ``"confidence"`` (its softmax probability over the candidate labels,
        as a Python float).
    """
    # Candidate descriptions the screenshot is scored against.
    labels = [
        "Python code editor",
        "JavaScript code",
        "HTML/CSS markup",
        "Terminal/console output",
        "Error message",
        "Browser DevTools",
        "IDE or text editor",
        "File explorer",
        "Command line",
        "Documentation page",
    ]

    # Screenshots may carry an alpha channel; work on plain RGB.
    # Resize for faster processing
    image = image.convert("RGB").resize((224, 224), Image.Resampling.LANCZOS)

    inputs = clip_processor(
        text=labels,
        images=image,
        return_tensors="pt",
        padding=True,
    ).to(DEVICE)

    with torch.no_grad():
        outputs = clip_model(**inputs)

    # Use the model's own image-text logits (normalised embeddings times the
    # learned logit scale). A softmax over raw dot products of the
    # unnormalised get_*_features() outputs does not yield meaningful
    # probabilities.
    probs = outputs.logits_per_image.softmax(dim=-1)[0]
    top_idx = int(probs.argmax())

    return {
        "detected_context": labels[top_idx],
        "confidence": float(probs[top_idx]),
    }
# Prompt sent to the LLM; {context} and {query} are filled in per request.
_TUTOR_PROMPT_TEMPLATE = """You are an expert coding tutor teaching beginners. Your rules:
1. Explain like they've never coded before - define every term
2. Use analogies - relate coding concepts to real-world things
3. Break it down - never give full solutions, only next small step
4. Be encouraging - celebrate small wins
5. Use simple language - avoid jargon without explanation
6. Give code examples - show concrete examples when relevant
Current screen context: {context}
User's question: {query}
Provide a step-by-step explanation (2-3 short paragraphs maximum). Be friendly and encouraging."""


def _format_history(history: list, limit: int = 4) -> str:
    """Render the last ``limit`` messages as "User: ..." / "Assistant: ..." lines."""
    lines = []
    for msg in (history or [])[-limit:]:
        # Skip entries that are not well-formed chat messages instead of
        # failing the whole request with a KeyError/TypeError.
        if not isinstance(msg, dict) or "content" not in msg:
            continue
        speaker = "User" if msg.get("role") == "user" else "Assistant"
        lines.append(f"{speaker}: {msg['content']}\n")
    return "".join(lines)


def generate_beginner_guidance(
    user_query: str,
    screen_context: str,
    history: list
) -> str:
    """Generate a beginner-friendly explanation using the LLM.

    Args:
        user_query: The learner's question, inserted into the prompt.
        screen_context: Description of what is on screen, inserted into the
            prompt.
        history: Prior chat messages as dicts with ``"role"`` and
            ``"content"`` keys; only the last four are included. Malformed
            entries are ignored.

    Returns:
        The newly generated text (prompt tokens excluded), stripped of
        surrounding whitespace.
    """
    prompt = _TUTOR_PROMPT_TEMPLATE.format(
        context=screen_context, query=user_query
    )
    history_text = _format_history(history)
    if history_text:
        prompt += f"\n\nPrevious conversation:\n{history_text}"

    # The whole prompt travels as a single user turn of the chat template.
    messages = [{"role": "user", "content": prompt}]
    text = llm_tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    model_inputs = llm_tokenizer(
        text,
        return_tensors="pt",
        padding=True,
    ).to(DEVICE)

    with torch.no_grad():
        generated_ids = llm_model.generate(
            **model_inputs,
            max_new_tokens=200,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=llm_tokenizer.eos_token_id,
        )

    # generate() returns prompt + continuation; keep only the continuation.
    prompt_len = model_inputs.input_ids.shape[1]
    response = llm_tokenizer.decode(
        generated_ids[0][prompt_len:],
        skip_special_tokens=True,
    )
    return response.strip()
def text_to_speech(text: str, speed: float = 1.0) -> "str | None":
    """Convert text to speech using pyttsx3 and return the audio file path.

    Args:
        text: The text to speak. Empty or whitespace-only text produces no
            audio.
        speed: Multiplier applied to the base rate of 150; the resulting
            rate is clamped to the range 50-300. A falsy value is treated
            as 1.0.

    Returns:
        Path of the written audio file, or ``None`` when there is nothing to
        speak or synthesis fails (the failure is printed, not raised).
    """
    import tempfile

    if not text or not text.strip():
        return None
    try:
        # Adjust speed
        rate = int(150 * (speed or 1.0))
        tts_engine.setProperty('rate', max(50, min(300, rate)))
        # Give every request its own file: a single fixed path would be
        # overwritten when two requests overlap. The file is intentionally
        # not deleted here because the caller still has to read it.
        with tempfile.NamedTemporaryFile(
            prefix="speech_", suffix=".wav", delete=False
        ) as tmp:
            temp_file = tmp.name
        tts_engine.save_to_file(text, temp_file)
        tts_engine.runAndWait()
        return temp_file
    except Exception as e:
        # Audio is optional; report the problem and let the caller continue.
        print(f"[ERROR] TTS failed: {e}")
        return None
| # ============================================ | |
| # GRADIO INTERFACE | |
| # ============================================ | |
def coder_tutor(
    screenshot: "Image.Image",
    user_query: str,
    speech_speed: float,
    history_json: str
):
    """Main tutor function: screenshot + question -> guidance, audio, history.

    Args:
        screenshot: The uploaded screenshot, or ``None`` if nothing was
            uploaded.
        user_query: The learner's question; when empty a default question is
            used instead.
        speech_speed: Speed multiplier forwarded to ``text_to_speech``.
        history_json: JSON-encoded list of prior chat messages. Missing,
            unparsable or non-list values are treated as an empty history.

    Returns:
        A 3-tuple ``(guidance_text, audio_path_or_None, history_json)``.
        On success the history has the new user/assistant turn appended.
        When no screenshot is given or an error occurs, the first element is
        a message for the user, the audio is ``None`` and the incoming
        history is handed back unchanged so the conversation is not lost.
    """
    # NOTE(review): the leading character of the two user-facing messages
    # below looks like an emoji damaged by an encoding round-trip - confirm
    # the intended character.
    if screenshot is None:
        return "β Please upload a screenshot", None, history_json
    try:
        # Parse history
        try:
            history = json.loads(history_json) if history_json else []
        except (TypeError, ValueError):  # JSONDecodeError is a ValueError
            history = []
        if not isinstance(history, list):
            history = []

        # The question actually sent to the model (and recorded in history).
        question = user_query or "What should I do next?"

        # 1. Analyze screenshot
        print("[INFO] Analyzing screenshot...")
        analysis = analyze_screenshot_with_clip(screenshot)
        screen_context = analysis["detected_context"]

        # 2. Generate guidance
        print("[INFO] Generating guidance...")
        guidance = generate_beginner_guidance(
            user_query=question,
            screen_context=screen_context,
            history=history
        )

        # 3. Generate speech
        print("[INFO] Generating speech...")
        audio_file = text_to_speech(guidance, speed=speech_speed)

        # 4. Update history
        new_history = history + [
            {"role": "user", "content": question},
            {"role": "assistant", "content": guidance}
        ]
        return guidance, audio_file, json.dumps(new_history)
    except Exception as e:
        # Boundary handler for the UI: log, show the error, keep the history.
        print(f"[ERROR] Tutor failed: {e}")
        return f"β Error: {str(e)}", None, history_json
| # ============================================ | |
| # BUILD GRADIO INTERFACE | |
| # ============================================ | |
# Declarative UI definition. Two columns sit side by side in one row: inputs
# in the first, outputs in the second. Components are created in the order
# they are written, so statement order here is the layout.
# NOTE(review): several labels and headings contain characters such as "π"
# and "β" that look like emoji damaged by an encoding round-trip; the strings
# are left exactly as found (flush-left inside the triple-quoted literals) -
# confirm against the original file.
with gr.Blocks(title="Coder Tutor", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
# π Coder Tutor
Real-time AI coaching for learning to code.
**How to use:**
1. πΈ Upload a screenshot of your screen
2. β Ask a question (e.g., "What's a function?")
3. π§ Get explanation + hear audio guidance
4. π Keep the conversation going with more questions
""")
    with gr.Row():
        with gr.Column():
            # Inputs
            # type="pil" lines up with the Image.Image parameter that
            # coder_tutor declares for the screenshot.
            screenshot = gr.Image(
                label="πΈ Screenshot",
                type="pil",
                scale=1
            )
            user_query = gr.Textbox(
                label="β Your Question",
                placeholder="E.g., 'What is a function?' or 'How do I fix this error?'",
                lines=2
            )
            # Value is passed to coder_tutor as speech_speed.
            speech_speed = gr.Slider(
                label="π§ Speech Speed",
                minimum=0.5,
                maximum=2.0,
                value=1.0,
                step=0.1
            )
            submit_btn = gr.Button("π Get Help", scale=2, variant="primary")
        with gr.Column():
            # Outputs
            guidance = gr.Textbox(
                label="π¬ Guidance",
                lines=8,
                interactive=False
            )
            audio_output = gr.Audio(
                label="π Listen to Explanation",
                type="filepath"
            )
            # NOTE(review): this textbox is not listed in the `outputs` of
            # the click handler below, so nothing in this block ever writes
            # to it - it stays empty unless it is wired up.
            confidence = gr.Textbox(
                label="π Detected Context",
                interactive=False
            )
    # Hidden state for conversation history
    # Holds the JSON-encoded history string; starts as an empty JSON list and
    # is both read and overwritten by every click.
    history_state = gr.State(value="[]")
    # Button click handler
    # Thin pass-through to coder_tutor; returns its three values unchanged.
    def on_submit(screenshot, query, speed, history_json):
        guidance, audio, new_history = coder_tutor(
            screenshot, query, speed, history_json
        )
        return guidance, audio, new_history
    submit_btn.click(
        on_submit,
        inputs=[screenshot, user_query, speech_speed, history_state],
        outputs=[guidance, audio_output, history_state]
    )
    gr.Markdown("""
---
**Tips for Best Results:**
- Be specific: "Explain for loops" works better than "help"
- Include relevant code in your screenshot
- Adjust speech speed for your learning pace
- One concept at a time - master it before moving on
""")
# Launch the Gradio app only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    demo.launch()