"""Educational Question Solver.

Pipeline: screenshot image -> GPT-4 Vision OCR -> GPT answer.
Exposed two ways:
  * a FastAPI service (``/api/process``, ``/health``, test endpoints), and
  * a Gradio demo mounted under ``/gradio``.
Designed to run locally or on Hugging Face Spaces (API key discovery tries
env vars, the Spaces secrets file, then a local ``.env``).
"""

import base64
import io
import os
from typing import Optional, Tuple

import gradio as gr
import uvicorn
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from openai import OpenAI
from PIL import Image
from pydantic import BaseModel


def crop_image(image_path: str,
               top_left: Tuple[int, int],
               bottom_right: Tuple[int, int],
               output_path: Optional[str] = None) -> Image.Image:
    """
    Crop an image using pixel coordinates.

    Args:
        image_path: Path to the input image file
        top_left: (x, y) coordinates of the top-left corner of the crop area
        bottom_right: (x, y) coordinates of the bottom-right corner of the crop area
        output_path: Optional path to save the cropped image. If None, the
            cropped image is only returned, not saved.

    Returns:
        PIL Image object of the cropped image

    Raises:
        FileNotFoundError: If the input image file doesn't exist
        ValueError: If the image cannot be opened/saved or coordinates are invalid
    """
    # Validate input file exists before handing it to PIL
    if not os.path.exists(image_path):
        raise FileNotFoundError(f"Image file not found: {image_path}")

    # Open the image; wrap PIL's varied exceptions in a single ValueError
    try:
        image = Image.open(image_path)
    except Exception as e:
        raise ValueError(f"Failed to open image: {e}")

    # Extract coordinates
    x1, y1 = top_left
    x2, y2 = bottom_right

    # Validate coordinates against the actual image dimensions
    width, height = image.size
    if x1 < 0 or y1 < 0 or x2 > width or y2 > height:
        raise ValueError(
            f"Coordinates out of bounds. Image size: {width}x{height}, "
            f"requested crop: ({x1},{y1}) to ({x2},{y2})")
    if x2 <= x1 or y2 <= y1:
        raise ValueError(
            f"Invalid crop dimensions. Bottom-right must be greater than "
            f"top-left: ({x1},{y1}) to ({x2},{y2})")

    # PIL crop box format: (left, upper, right, lower)
    crop_box = (x1, y1, x2, y2)

    # Crop the image
    cropped_image = image.crop(crop_box)

    # Save if output path provided
    if output_path:
        try:
            cropped_image.save(output_path)
            print(f"Cropped image saved to: {output_path}")
        except Exception as e:
            raise ValueError(f"Failed to save cropped image: {e}")

    return cropped_image


# Initialize OpenAI client with error handling
def get_openai_client():
    """Build an OpenAI client, discovering the API key from several sources.

    Lookup order: OPENAI_API_KEY env var, the Hugging Face Spaces secrets
    file, then a local .env file (if python-dotenv is installed).
    Returns None when no key is found or the client fails to initialize.
    """
    # Try multiple ways to get the API key
    api_key = None

    # Method 1: Environment variable (most common in Hugging Face Spaces)
    api_key = os.getenv("OPENAI_API_KEY")

    # Method 2: Try reading from secrets file (Hugging Face Spaces)
    if not api_key:
        try:
            secrets_file = "/app/secrets.json"  # Hugging Face Spaces secrets location
            if os.path.exists(secrets_file):
                import json
                with open(secrets_file, 'r') as f:
                    secrets = json.load(f)
                api_key = secrets.get("OPENAI_API_KEY")
        except Exception as e:
            print(f"Could not read secrets file: {e}")

    # Method 3: For local development, try loading .env file
    if not api_key:
        try:
            from dotenv import load_dotenv
            load_dotenv()  # Load .env file from current directory
            api_key = os.getenv("OPENAI_API_KEY")
            if api_key:
                print("Loaded API key from .env file (local development)")
        except ImportError:
            print("python-dotenv not installed, skipping .env file loading")
        except Exception as e:
            print(f"Could not load .env file: {e}")

    # Debug logging — log only the key length, never the key itself
    if api_key:
        print(f"Found API key (length: {len(api_key)})")
    else:
        print("No API key found in environment variables, secrets, or .env file")

    if not api_key:
        return None

    try:
        client = OpenAI(api_key=api_key.strip())  # Strip any whitespace
        print("OpenAI client initialized successfully")
        return client
    except Exception as e:
        print(f"Failed to initialize OpenAI client: {e}")
        return None


# Module-level client shared by every endpoint; may be None if no key found.
client = get_openai_client()


def process_image_with_gpt4(image_data: str,
                            crop_region: Optional[Tuple[Tuple[int, int], Tuple[int, int]]] = None) -> str:
    """
    Process image using GPT-4 vision API for OCR/text extraction.

    Args:
        image_data: Base64 encoded image data (raw base64 or a data: URL)
        crop_region: Optional ((x1, y1), (x2, y2)) coordinates to crop the
            image before OCR. Out-of-bounds coordinates are clamped; an
            invalid/failed crop falls back to the full image.

    Returns:
        The extracted text, or a string starting with "Error:" on failure
        (callers test ``result.startswith("Error")``).
    """
    try:
        if not client:
            return "Error: OpenAI client not initialized - check API key"

        print(f"Processing image data of length: {len(image_data)}")

        # Handle data URL format (data:image/png;base64,...)
        if image_data.startswith('data:'):
            # Extract the base64 part after the comma
            if ',' in image_data:
                image_data = image_data.split(',')[1]
            else:
                return "Error: Invalid data URL format - no comma found"

        # Validate base64
        try:
            image_bytes = base64.b64decode(image_data)
            print(f"Decoded image bytes length: {len(image_bytes)}")
        except Exception as decode_error:
            return f"Error: Failed to decode base64 - {str(decode_error)}"

        # Verify it's a valid image
        try:
            image = Image.open(io.BytesIO(image_bytes))
            print(f"Image validated: {image.size}, mode: {image.mode}")
        except Exception as image_error:
            return f"Error: Failed to validate image - {str(image_error)}"

        # Crop image if crop_region is specified
        if crop_region:
            try:
                top_left, bottom_right = crop_region
                print(f"Applying crop with coordinates: {top_left} to {bottom_right}")

                # PIL crop box format: (left, upper, right, lower)
                crop_box = (top_left[0], top_left[1], bottom_right[0], bottom_right[1])

                # Validate crop coordinates; clamp to bounds rather than fail
                width, height = image.size
                x1, y1, x2, y2 = crop_box
                if x1 < 0 or y1 < 0 or x2 > width or y2 > height:
                    print(f"Warning: Crop coordinates partially out of bounds. "
                          f"Image size: {width}x{height}, requested crop: {crop_box}")
                    # Adjust coordinates to stay within bounds
                    x1 = max(0, min(x1, width))
                    y1 = max(0, min(y1, height))
                    x2 = max(0, min(x2, width))
                    y2 = max(0, min(y2, height))
                    crop_box = (x1, y1, x2, y2)

                if x2 > x1 and y2 > y1:
                    original_size = image.size
                    image = image.crop(crop_box)
                    print(f"Image cropped from {original_size} to {image.size}")

                    # Convert cropped image back to base64 so the API call
                    # below sends the cropped pixels
                    buffered = io.BytesIO()
                    image.save(buffered, format="PNG")
                    image_data = base64.b64encode(buffered.getvalue()).decode()
                else:
                    print(f"Warning: Invalid crop dimensions, skipping crop: {crop_box}")
            except Exception as crop_error:
                print(f"Warning: Failed to crop image: {crop_error}")
                # Continue with original image

        # Save image for local debugging if LOCAL_DEBUG is enabled
        is_local_debug = os.getenv("LOCAL_DEBUG", "false").lower() == "true"
        if is_local_debug:
            try:
                import datetime
                timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
                debug_filename = f"debug_input_{timestamp}.png"
                Image.open(io.BytesIO(base64.b64decode(image_data))).save(debug_filename)
                print(f"šŸ› DEBUG: Saved input image as {debug_filename}")
            except Exception as save_error:
                print(f"Could not save debug image: {save_error}")

        # Process with GPT-4 Vision
        try:
            print("Calling GPT-4 Vision API...")
            response = client.chat.completions.create(
                model="gpt-4o",
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image/png;base64,{image_data}"
                                }
                            },
                            {
                                "type": "text",
                                "text": "Extract all text from this image. Include every word, number, symbol, and mathematical notation. If there are multiple questions or sections, preserve the structure. Use LaTeX for mathematical expressions."
                            }
                        ]
                    }
                ],
                max_tokens=1000,
                temperature=0.3
            )

            extracted_text = response.choices[0].message.content.strip()
            print(f"GPT-4 Vision extraction completed. Text length: {len(extracted_text)}")
            if is_local_debug:
                print(f"šŸ› DEBUG: Extracted text: '{extracted_text}'")
            return extracted_text
        except Exception as ocr_error:
            error_msg = f"Error: GPT-4 Vision processing failed - {str(ocr_error)}"
            print(f"āŒ {error_msg}")
            import traceback
            print(f"āŒ Full traceback: {traceback.format_exc()}")
            return error_msg

    except Exception as e:
        return f"Error: Unexpected error - {str(e)}"


def compare_ocr_with_crop(image_data: str,
                          crop_region: Tuple[Tuple[int, int], Tuple[int, int]]) -> dict:
    """
    Compare OCR results between cropped and uncropped versions of an image
    using GPT-4 Vision.

    Args:
        image_data: Base64 encoded image data
        crop_region: ((x1, y1), (x2, y2)) coordinates for cropping

    Returns:
        Dictionary containing both OCR results and comparison metrics
    """
    print("šŸ” Starting OCR comparison: cropped vs uncropped (GPT-4 Vision)")
    print(f"šŸ“ Crop region: {crop_region[0]} to {crop_region[1]}")

    # Process uncropped image
    print("\nšŸ“„ Processing uncropped image...")
    uncropped_result = process_image_with_gpt4(image_data, crop_region=None)

    # Process cropped image
    print("\nāœ‚ļø Processing cropped image...")
    cropped_result = process_image_with_gpt4(image_data, crop_region=crop_region)

    # Analyze results — errors are signalled by an "Error" prefix
    results = {
        "crop_region": crop_region,
        "uncropped": {
            "text": uncropped_result,
            "length": len(uncropped_result),
            "is_error": uncropped_result.startswith("Error")
        },
        "cropped": {
            "text": cropped_result,
            "length": len(cropped_result),
            "is_error": cropped_result.startswith("Error")
        },
        "comparison": {
            "texts_match": uncropped_result == cropped_result,
            "length_difference": len(cropped_result) - len(uncropped_result),
            "cropped_has_more_text": len(cropped_result) > len(uncropped_result),
            "both_successful": not (uncropped_result.startswith("Error")
                                    or cropped_result.startswith("Error"))
        }
    }

    # Print comparison summary
    print("\nšŸ“Š OCR Comparison Results:")
    print(f"Uncropped text length: {results['uncropped']['length']} characters")
    print(f"Cropped text length: {results['cropped']['length']} characters")
    print(f"Length difference: {results['comparison']['length_difference']} characters")
    print(f"Texts match exactly: {results['comparison']['texts_match']}")
    print(f"Both OCR operations successful: {results['comparison']['both_successful']}")

    if results['comparison']['both_successful']:
        if results['comparison']['texts_match']:
            print("āœ… Cropped and uncropped images produced identical OCR results")
        elif results['comparison']['cropped_has_more_text']:
            print("šŸ“ˆ Cropped image produced more text than uncropped image")
        else:
            print("šŸ“‰ Cropped image produced less text than uncropped image")
    else:
        print("āŒ One or both OCR operations failed")

    return results


def get_gpt_answer(question: str) -> str:
    """
    Send the extracted question to GPT and get an answer.

    Returns the answer text, or a string starting with "Error" on failure.
    """
    try:
        if not client:
            return "Error: OpenAI client not initialized - check API key configuration"
        if not hasattr(client, 'api_key') or not client.api_key:
            return "Error: OpenAI API key not configured"

        response = client.chat.completions.create(
            model="gpt-4",
            messages=[
                {
                    "role": "system",
                    "content": "You are a helpful assistant that provides concise, accurate answers to academic questions. Keep responses brief but informative."
                },
                {
                    "role": "user",
                    "content": f"Please answer this question concisely: {question}"
                }
            ],
            max_tokens=500,
            temperature=0.3
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        return f"Error: {str(e)}"


def process_screenshot_with_crop_comparison(image_data: str,
                                            crop_region: Tuple[Tuple[int, int], Tuple[int, int]]) -> dict:
    """
    Process screenshot with crop comparison - extracts text from both cropped
    and uncropped versions and gets GPT answers for both, then compares the
    results.

    Args:
        image_data: Base64 encoded image data
        crop_region: ((x1, y1), (x2, y2)) coordinates for cropping

    Returns:
        Dictionary containing OCR and GPT results for both versions
    """
    print("šŸŽÆ Starting screenshot processing with crop comparison (GPT-4 Vision)")

    if not image_data:
        return {"error": "No image data provided"}

    # Get OCR comparison results
    ocr_comparison = compare_ocr_with_crop(image_data, crop_region)

    # Get GPT answers for both versions (if OCR was successful)
    gpt_answers = {}
    if not ocr_comparison["uncropped"]["is_error"]:
        print("\nšŸ¤– Getting GPT answer for uncropped text...")
        gpt_answers["uncropped"] = get_gpt_answer(ocr_comparison["uncropped"]["text"])
    if not ocr_comparison["cropped"]["is_error"]:
        print("\nšŸ¤– Getting GPT answer for cropped text...")
        gpt_answers["cropped"] = get_gpt_answer(ocr_comparison["cropped"]["text"])

    # Build final result
    result = {
        "crop_region": crop_region,
        "ocr_comparison": ocr_comparison,
        "gpt_answers": gpt_answers,
        "success": True
    }

    # Add comparison insights when both answers are available
    if "uncropped" in gpt_answers and "cropped" in gpt_answers:
        answers_match = gpt_answers["uncropped"] == gpt_answers["cropped"]
        result["comparison"] = {
            "gpt_answers_match": answers_match,
            "ocr_texts_match": ocr_comparison["comparison"]["texts_match"]
        }
        print(f"\nšŸŽÆ Final Comparison:")
        print(f"OCR texts match: {ocr_comparison['comparison']['texts_match']}")
        print(f"GPT answers match: {answers_match}")
        if answers_match:
            print("āœ… Cropped and uncropped processing produced identical results")
        else:
            print("āš ļø Cropped and uncropped processing produced different results")

    return result


def process_screenshot(image_data: str) -> dict:
    """
    Main processing function that handles the entire pipeline:
    OCR via GPT-4 Vision, then a GPT answer for the extracted text.
    """
    if not image_data:
        return {"error": "No image data provided"}

    # Extract text from image using GPT-4 Vision
    extracted_text = process_image_with_gpt4(image_data)
    if extracted_text.startswith("Error"):
        return {"error": extracted_text}

    # Get GPT answer
    gpt_answer = get_gpt_answer(extracted_text)

    return {
        "extracted_text": extracted_text,
        "answer": gpt_answer,
        "success": True
    }


# Gradio interface for testing
def gradio_interface(image):
    """
    Gradio interface for testing the GPT-4 Vision + GPT pipeline.
    Takes a PIL image, returns (extracted_text, answer) strings.
    """
    if image is None:
        return "No image provided", "No answer available"

    # Convert PIL image to base64 for the shared pipeline
    buffered = io.BytesIO()
    image.save(buffered, format="PNG")
    img_str = base64.b64encode(buffered.getvalue()).decode()

    result = process_screenshot(img_str)
    if "error" in result:
        return result["error"], "Error occurred"
    return result["extracted_text"], result["answer"]


# Create Gradio app
demo = gr.Interface(
    fn=gradio_interface,
    inputs=gr.Image(type="pil", label="Upload Screenshot"),
    outputs=[
        gr.Textbox(label="Extracted Text", lines=3),
        gr.Textbox(label="GPT Answer", lines=5)
    ],
    title="Educational Question Solver",
    description="Upload a screenshot of a question and get an AI-powered answer for educational purposes.",
    examples=None
)

# FastAPI setup for proper API endpoints
app = FastAPI(title="Educational Question Solver API")

# Add CORS middleware
# NOTE(review): wildcard origins together with allow_credentials=True is
# rejected by the CORS spec (browsers won't send credentials) — confirm
# whether credentials are actually needed here.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


class ImageRequest(BaseModel):
    # Base64-encoded screenshot (raw base64 or data: URL)
    image_data: str


@app.post("/api/process")
async def process_image_endpoint(request: ImageRequest):
    """
    API endpoint that the Electron app will call.
    """
    try:
        result = process_screenshot(request.image_data)
        return result
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


@app.get("/")
async def root():
    return {"message": "Educational Question Solver API is running"}


@app.get("/health")
async def health_check():
    # Reports configuration state without leaking the API key itself
    return {
        "status": "healthy",
        "openai_client": client is not None,
        "ocr_model": "gpt-4o-vision",
        "local_debug": os.getenv("LOCAL_DEBUG", "false").lower() == "true",
        "openai_api_key_length": len(os.getenv("OPENAI_API_KEY", "")) if os.getenv("OPENAI_API_KEY") else 0
    }


@app.get("/test-openai")
async def test_openai():
    """Test endpoint to check OpenAI functionality"""
    if not client:
        return {"error": "OpenAI client not initialized"}
    try:
        # Simple test call
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": "Say 'Hello'"}],
            max_tokens=10
        )
        return {
            "success": True,
            "response": response.choices[0].message.content.strip()
        }
    except Exception as e:
        return {"error": f"OpenAI test failed: {str(e)}"}


@app.get("/test-vision")
async def test_vision():
    """Test endpoint to check GPT-4 Vision functionality"""
    if not client:
        return {"error": "OpenAI client not initialized"}
    try:
        # Create a simple test image
        from PIL import Image, ImageDraw
        test_image = Image.new('RGB', (200, 100), color='white')
        draw = ImageDraw.Draw(test_image)
        draw.text((10, 10), "TEST IMAGE", fill='black')

        # Convert to base64
        buffered = io.BytesIO()
        test_image.save(buffered, format="PNG")
        img_str = base64.b64encode(buffered.getvalue()).decode()

        # Test with GPT-4 Vision
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/png;base64,{img_str}"
                            }
                        },
                        {
                            "type": "text",
                            "text": "What text do you see in this image?"
                        }
                    ]
                }
            ],
            max_tokens=100
        )
        return {
            "success": True,
            "response": response.choices[0].message.content.strip()
        }
    except Exception as e:
        return {"error": f"Vision test failed: {str(e)}"}


# Mount Gradio app under the FastAPI service
app = gr.mount_gradio_app(app, demo, path="/gradio")

if __name__ == "__main__":
    # For local testing
    uvicorn.run(app, host="0.0.0.0", port=7860)