Spaces:
Sleeping
Sleeping
| import os | |
| import io | |
| import base64 | |
| from PIL import Image | |
| import gradio as gr | |
| from openai import OpenAI | |
| from typing import Optional, Tuple | |
def crop_image(image_path: str, top_left: Tuple[int, int], bottom_right: Tuple[int, int], output_path: Optional[str] = None) -> Image.Image:
    """
    Crop an image using pixel coordinates.

    Args:
        image_path: Path to the input image file
        top_left: Tuple of (x, y) coordinates for the top-left corner of the crop area
        bottom_right: Tuple of (x, y) coordinates for the bottom-right corner of the crop area
        output_path: Optional path to save the cropped image. If None, returns the cropped image.

    Returns:
        PIL Image object of the cropped image

    Raises:
        FileNotFoundError: If the input image file doesn't exist
        ValueError: If coordinates are invalid, or the image cannot be opened/saved
    """
    # Validate input file exists
    if not os.path.exists(image_path):
        raise FileNotFoundError(f"Image file not found: {image_path}")

    # Open the image; keep only the open() call inside the try so validation
    # errors below are not re-wrapped as "Failed to open image".
    try:
        source = Image.open(image_path)
    except Exception as e:
        raise ValueError(f"Failed to open image: {e}")

    # FIX: the original never closed the file handle held by Image.open().
    # Crop inside a context manager; image.crop() returns an independent
    # copy, so the source file can be released immediately afterwards.
    with source as image:
        # Extract coordinates
        x1, y1 = top_left
        x2, y2 = bottom_right

        # Validate coordinates against the actual image dimensions
        width, height = image.size
        if x1 < 0 or y1 < 0 or x2 > width or y2 > height:
            raise ValueError(f"Coordinates out of bounds. Image size: {width}x{height}, requested crop: ({x1},{y1}) to ({x2},{y2})")
        if x2 <= x1 or y2 <= y1:
            raise ValueError(f"Invalid crop dimensions. Bottom-right must be greater than top-left: ({x1},{y1}) to ({x2},{y2})")

        # PIL crop box format: (left, upper, right, lower)
        cropped_image = image.crop((x1, y1, x2, y2))

    # Save if output path provided
    if output_path:
        try:
            cropped_image.save(output_path)
            print(f"Cropped image saved to: {output_path}")
        except Exception as e:
            raise ValueError(f"Failed to save cropped image: {e}")
    return cropped_image
# Initialize OpenAI client with error handling
def get_openai_client():
    """Locate an OpenAI API key and build a client, or return None.

    Lookup order: OPENAI_API_KEY environment variable (usual on Hugging
    Face Spaces), the Spaces secrets JSON file, then a local .env file
    as a development fallback.
    """
    # Method 1: Environment variable (most common in Hugging Face Spaces)
    key = os.getenv("OPENAI_API_KEY")

    # Method 2: Hugging Face Spaces may expose secrets as a JSON file.
    if not key:
        try:
            spaces_secrets = "/app/secrets.json"  # Hugging Face Spaces secrets location
            if os.path.exists(spaces_secrets):
                import json
                with open(spaces_secrets, 'r') as fh:
                    key = json.load(fh).get("OPENAI_API_KEY")
        except Exception as e:
            print(f"Could not read secrets file: {e}")

    # Method 3: Local development — pull variables from a .env file.
    if not key:
        try:
            from dotenv import load_dotenv
            load_dotenv()  # Load .env file from current directory
            key = os.getenv("OPENAI_API_KEY")
            if key:
                print("Loaded API key from .env file (local development)")
        except ImportError:
            print("python-dotenv not installed, skipping .env file loading")
        except Exception as e:
            print(f"Could not load .env file: {e}")

    # Debug logging + early exit when nothing was found.
    if not key:
        print("No API key found in environment variables, secrets, or .env file")
        return None
    print(f"Found API key (length: {len(key)})")

    try:
        instance = OpenAI(api_key=key.strip())  # Strip any whitespace
        print("OpenAI client initialized successfully")
        return instance
    except Exception as e:
        print(f"Failed to initialize OpenAI client: {e}")
        return None
# Module-level singleton; every endpoint/helper checks `client` for None
# before making API calls, so a missing key degrades to error responses.
client = get_openai_client()
def process_image_with_gpt4(image_data: str, crop_region: Optional[Tuple[Tuple[int, int], Tuple[int, int]]] = None) -> str:
    """
    Process image using GPT-4 vision API for OCR/text extraction

    Args:
        image_data: Base64 encoded image data (a full data: URL is also accepted)
        crop_region: Optional tuple of ((x1, y1), (x2, y2)) coordinates to crop the image before OCR

    Returns:
        The extracted text, or a string starting with "Error" on any failure.
        Callers rely on the "Error" prefix to detect failure — keep it stable.
    """
    try:
        if not client:
            return "Error: OpenAI client not initialized - check API key"
        print(f"Processing image data of length: {len(image_data)}")
        # Handle data URL format (data:image/png;base64,...)
        if image_data.startswith('data:'):
            # Extract the base64 part after the comma
            if ',' in image_data:
                image_data = image_data.split(',')[1]
            else:
                return "Error: Invalid data URL format - no comma found"
        # Validate base64 before touching PIL or the API
        try:
            image_bytes = base64.b64decode(image_data)
            print(f"Decoded image bytes length: {len(image_bytes)}")
        except Exception as decode_error:
            return f"Error: Failed to decode base64 - {str(decode_error)}"
        # Verify it's a valid image
        try:
            image = Image.open(io.BytesIO(image_bytes))
            print(f"Image validated: {image.size}, mode: {image.mode}")
        except Exception as image_error:
            return f"Error: Failed to validate image - {str(image_error)}"
        # Crop image if crop_region is specified. Cropping is best-effort:
        # any failure logs a warning and falls back to the original image.
        if crop_region:
            try:
                top_left, bottom_right = crop_region
                print(f"Applying crop with coordinates: {top_left} to {bottom_right}")
                # PIL crop box format: (left, upper, right, lower)
                crop_box = (top_left[0], top_left[1], bottom_right[0], bottom_right[1])
                # Validate crop coordinates
                width, height = image.size
                x1, y1, x2, y2 = crop_box
                if x1 < 0 or y1 < 0 or x2 > width or y2 > height:
                    print(f"Warning: Crop coordinates partially out of bounds. Image size: {width}x{height}, requested crop: {crop_box}")
                    # Adjust coordinates to stay within bounds (clamp, don't fail)
                    x1 = max(0, min(x1, width))
                    y1 = max(0, min(y1, height))
                    x2 = max(0, min(x2, width))
                    y2 = max(0, min(y2, height))
                    crop_box = (x1, y1, x2, y2)
                if x2 > x1 and y2 > y1:
                    original_size = image.size
                    image = image.crop(crop_box)
                    print(f"Image cropped from {original_size} to {image.size}")
                    # Re-encode so the API call below sends the cropped
                    # pixels, not the original base64 payload.
                    buffered = io.BytesIO()
                    image.save(buffered, format="PNG")
                    image_data = base64.b64encode(buffered.getvalue()).decode()
                else:
                    print(f"Warning: Invalid crop dimensions, skipping crop: {crop_box}")
            except Exception as crop_error:
                print(f"Warning: Failed to crop image: {crop_error}")
                # Continue with original image
        # Save image for local debugging if LOCAL_DEBUG is enabled
        is_local_debug = os.getenv("LOCAL_DEBUG", "false").lower() == "true"
        if is_local_debug:
            try:
                import datetime
                timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
                debug_filename = f"debug_input_{timestamp}.png"
                # Decode image_data again (it may now be the cropped version)
                Image.open(io.BytesIO(base64.b64decode(image_data))).save(debug_filename)
                print(f"🐛 DEBUG: Saved input image as {debug_filename}")
            except Exception as save_error:
                print(f"Could not save debug image: {save_error}")
        # Process with GPT-4 Vision
        try:
            print("Calling GPT-4 Vision API...")
            response = client.chat.completions.create(
                model="gpt-4o",
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image/png;base64,{image_data}"
                                }
                            },
                            {
                                "type": "text",
                                "text": "Extract all text from this image. Include every word, number, symbol, and mathematical notation. If there are multiple questions or sections, preserve the structure. Use LaTeX for mathematical expressions."
                            }
                        ]
                    }
                ],
                max_tokens=1000,
                temperature=0.3
            )
            extracted_text = response.choices[0].message.content.strip()
            print(f"GPT-4 Vision extraction completed. Text length: {len(extracted_text)}")
            if is_local_debug:
                print(f"🐛 DEBUG: Extracted text: '{extracted_text}'")
            return extracted_text
        except Exception as ocr_error:
            error_msg = f"Error: GPT-4 Vision processing failed - {str(ocr_error)}"
            print(f"❌ {error_msg}")
            import traceback
            print(f"❌ Full traceback: {traceback.format_exc()}")
            return error_msg
    except Exception as e:
        # Catch-all boundary so callers always get the "Error" string protocol
        return f"Error: Unexpected error - {str(e)}"
def compare_ocr_with_crop(image_data: str, crop_region: Tuple[Tuple[int, int], Tuple[int, int]]) -> dict:
    """
    Run GPT-4 Vision OCR twice — once on the full image and once on the
    cropped region — and report how the two extractions differ.

    Args:
        image_data: Base64 encoded image data
        crop_region: Tuple of ((x1, y1), (x2, y2)) coordinates for cropping

    Returns:
        Dictionary with both raw texts, their lengths, error flags, and a
        comparison summary.
    """
    print("🔍 Starting OCR comparison: cropped vs uncropped (GPT-4 Vision)")
    print(f"📏 Crop region: {crop_region[0]} to {crop_region[1]}")

    # First pass: whole image, no cropping.
    print("\n📄 Processing uncropped image...")
    full_text = process_image_with_gpt4(image_data, crop_region=None)

    # Second pass: only the requested region.
    print("\n✂️ Processing cropped image...")
    region_text = process_image_with_gpt4(image_data, crop_region=crop_region)

    # Derive the comparison figures once, up front.
    full_failed = full_text.startswith("Error")
    region_failed = region_text.startswith("Error")
    both_ok = not (full_failed or region_failed)
    identical = full_text == region_text
    delta = len(region_text) - len(full_text)
    region_longer = len(region_text) > len(full_text)

    results = {
        "crop_region": crop_region,
        "uncropped": {
            "text": full_text,
            "length": len(full_text),
            "is_error": full_failed
        },
        "cropped": {
            "text": region_text,
            "length": len(region_text),
            "is_error": region_failed
        },
        "comparison": {
            "texts_match": identical,
            "length_difference": delta,
            "cropped_has_more_text": region_longer,
            "both_successful": both_ok
        }
    }

    # Human-readable summary for the logs.
    print("\n📊 OCR Comparison Results:")
    print(f"Uncropped text length: {len(full_text)} characters")
    print(f"Cropped text length: {len(region_text)} characters")
    print(f"Length difference: {delta} characters")
    print(f"Texts match exactly: {identical}")
    print(f"Both OCR operations successful: {both_ok}")

    if both_ok:
        if identical:
            print("✅ Cropped and uncropped images produced identical OCR results")
        elif region_longer:
            print("📈 Cropped image produced more text than uncropped image")
        else:
            print("📉 Cropped image produced less text than uncropped image")
    else:
        print("❌ One or both OCR operations failed")
    return results
def get_gpt_answer(question: str) -> str:
    """
    Send the extracted question text to GPT-4 and return a concise answer.

    Returns the model's reply, or a string starting with "Error" on failure.
    """
    try:
        # Guard clauses: no client / no key means we cannot call the API.
        if not client:
            return "Error: OpenAI client not initialized - check API key configuration"
        if not hasattr(client, 'api_key') or not client.api_key:
            return "Error: OpenAI API key not configured"

        # Same prompt and parameters as always, just assembled up front.
        chat_messages = [
            {
                "role": "system",
                "content": "You are a helpful assistant that provides concise, accurate answers to academic questions. Keep responses brief but informative."
            },
            {
                "role": "user",
                "content": f"Please answer this question concisely: {question}"
            }
        ]
        completion = client.chat.completions.create(
            model="gpt-4",
            messages=chat_messages,
            max_tokens=500,
            temperature=0.3
        )
        return completion.choices[0].message.content.strip()
    except Exception as e:
        return f"Error: {str(e)}"
def process_screenshot_with_crop_comparison(image_data: str, crop_region: Tuple[Tuple[int, int], Tuple[int, int]]) -> dict:
    """
    Full pipeline with crop comparison: OCR both the cropped and uncropped
    versions, get GPT answers for each successful extraction, and compare.

    Args:
        image_data: Base64 encoded image data
        crop_region: Tuple of ((x1, y1), (x2, y2)) coordinates for cropping

    Returns:
        Dictionary containing OCR and GPT results for both versions
    """
    print("🎯 Starting screenshot processing with crop comparison (GPT-4 Vision)")
    if not image_data:
        return {"error": "No image data provided"}

    ocr_comparison = compare_ocr_with_crop(image_data, crop_region)

    # Ask GPT for an answer wherever OCR succeeded (uncropped first,
    # matching the original log order).
    gpt_answers = {}
    for variant in ("uncropped", "cropped"):
        if not ocr_comparison[variant]["is_error"]:
            print(f"\n🤖 Getting GPT answer for {variant} text...")
            gpt_answers[variant] = get_gpt_answer(ocr_comparison[variant]["text"])

    result = {
        "crop_region": crop_region,
        "ocr_comparison": ocr_comparison,
        "gpt_answers": gpt_answers,
        "success": True
    }

    # Compare answers only when both variants produced one.
    if "uncropped" in gpt_answers and "cropped" in gpt_answers:
        answers_match = gpt_answers["uncropped"] == gpt_answers["cropped"]
        result["comparison"] = {
            "gpt_answers_match": answers_match,
            "ocr_texts_match": ocr_comparison["comparison"]["texts_match"]
        }
        print(f"\n🎯 Final Comparison:")
        print(f"OCR texts match: {ocr_comparison['comparison']['texts_match']}")
        print(f"GPT answers match: {answers_match}")
        if answers_match:
            print("✅ Cropped and uncropped processing produced identical results")
        else:
            print("⚠️ Cropped and uncropped processing produced different results")
    return result
def process_screenshot(image_data: str) -> dict:
    """
    Main pipeline: OCR the screenshot with GPT-4 Vision, then answer it.

    Returns a dict with an "error" key on failure, otherwise the extracted
    text, the GPT answer, and success=True.
    """
    if not image_data:
        return {"error": "No image data provided"}

    # Extract text from image using GPT-4 Vision; the "Error" prefix is
    # the OCR helper's failure signal.
    ocr_text = process_image_with_gpt4(image_data)
    if ocr_text.startswith("Error"):
        return {"error": ocr_text}

    return {
        "extracted_text": ocr_text,
        "answer": get_gpt_answer(ocr_text),
        "success": True
    }
# Gradio interface for testing
def gradio_interface(image):
    """
    Gradio entry point: PNG-encode the uploaded PIL image, run the OCR +
    answer pipeline, and return (extracted_text, answer) for the two boxes.
    """
    if image is None:
        return "No image provided", "No answer available"

    # Wrap the PIL image in base64 PNG, the format the pipeline expects.
    png_buffer = io.BytesIO()
    image.save(png_buffer, format="PNG")
    encoded = base64.b64encode(png_buffer.getvalue()).decode()

    outcome = process_screenshot(encoded)
    if "error" in outcome:
        return outcome["error"], "Error occurred"
    return outcome["extracted_text"], outcome["answer"]
# Create Gradio app
# Simple manual-test UI: upload a screenshot, see the OCR text and GPT answer.
# Mounted under /gradio further below; the JSON API is served separately.
demo = gr.Interface(
    fn=gradio_interface,
    inputs=gr.Image(type="pil", label="Upload Screenshot"),
    outputs=[
        gr.Textbox(label="Extracted Text", lines=3),
        gr.Textbox(label="GPT Answer", lines=5)
    ],
    title="Educational Question Solver",
    description="Upload a screenshot of a question and get an AI-powered answer for educational purposes.",
    examples=None
)
# FastAPI setup for proper API endpoints
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
import uvicorn

app = FastAPI(title="Educational Question Solver API")
# Add CORS middleware so a browser/Electron client on any origin can call us.
# NOTE(review): allow_origins=["*"] together with allow_credentials=True is
# very permissive — confirm this is acceptable for the deployment.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
class ImageRequest(BaseModel):
    """Request body for the image-processing endpoint."""
    # Base64-encoded screenshot; a full data: URL is also accepted downstream.
    image_data: str
# FIX: this handler was defined but never registered with the app, so the
# endpoint did not exist. Path is a best guess — TODO confirm the path the
# Electron client actually calls.
@app.post("/process")
async def process_image_endpoint(request: ImageRequest):
    """
    API endpoint that the Electron app will call.

    Args:
        request: JSON body carrying the base64-encoded screenshot.

    Returns:
        The process_screenshot() result dict (extracted text + answer,
        or an "error" key).

    Raises:
        HTTPException: 500 with the error message on unexpected failure.
    """
    try:
        result = process_screenshot(request.image_data)
        return result
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
# FIX: handler was defined but never registered with the app.
@app.get("/")
async def root():
    """Liveness message for the API root."""
    return {"message": "Educational Question Solver API is running"}
# FIX: handler was defined but never registered with the app.
@app.get("/health")
async def health_check():
    """
    Health/status report: client availability, OCR model name, debug flag,
    and the length (never the value) of the configured API key.
    """
    return {
        "status": "healthy",
        "openai_client": client is not None,
        "ocr_model": "gpt-4o-vision",
        "local_debug": os.getenv("LOCAL_DEBUG", "false").lower() == "true",
        "openai_api_key_length": len(os.getenv("OPENAI_API_KEY", "")) if os.getenv("OPENAI_API_KEY") else 0
    }
# FIX: handler was defined but never registered with the app.
@app.get("/test-openai")
async def test_openai():
    """Test endpoint to check OpenAI functionality."""
    if not client:
        return {"error": "OpenAI client not initialized"}
    try:
        # Cheap round-trip to verify the API key actually works.
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": "Say 'Hello'"}],
            max_tokens=10
        )
        return {
            "success": True,
            "response": response.choices[0].message.content.strip()
        }
    except Exception as e:
        return {"error": f"OpenAI test failed: {str(e)}"}
# FIX: handler was defined but never registered with the app.
@app.get("/test-vision")
async def test_vision():
    """Test endpoint to check GPT-4 Vision functionality."""
    if not client:
        return {"error": "OpenAI client not initialized"}
    try:
        # Create a simple in-memory test image with known text.
        from PIL import Image, ImageDraw
        test_image = Image.new('RGB', (200, 100), color='white')
        draw = ImageDraw.Draw(test_image)
        draw.text((10, 10), "TEST IMAGE", fill='black')

        # Convert to base64 so it can be sent as a data URL.
        buffered = io.BytesIO()
        test_image.save(buffered, format="PNG")
        img_str = base64.b64encode(buffered.getvalue()).decode()

        # Round-trip through GPT-4 Vision and report what it saw.
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/png;base64,{img_str}"
                            }
                        },
                        {
                            "type": "text",
                            "text": "What text do you see in this image?"
                        }
                    ]
                }
            ],
            max_tokens=100
        )
        return {
            "success": True,
            "response": response.choices[0].message.content.strip()
        }
    except Exception as e:
        return {"error": f"Vision test failed: {str(e)}"}
# Mount Gradio app
# The Gradio test UI lives under /gradio; the JSON API keeps the root paths.
app = gr.mount_gradio_app(app, demo, path="/gradio")

if __name__ == "__main__":
    # For local testing; port 7860 is the Hugging Face Spaces default.
    uvicorn.run(app, host="0.0.0.0", port=7860)