# app.py — Educational Question Solver (Hugging Face Space)
import os
import io
import base64
from PIL import Image
import gradio as gr
from openai import OpenAI
from typing import Optional, Tuple
def crop_image(image_path: str, top_left: Tuple[int, int], bottom_right: Tuple[int, int], output_path: Optional[str] = None) -> Image.Image:
    """
    Crop an image using pixel coordinates.

    Args:
        image_path: Path to the input image file
        top_left: (x, y) coordinates of the top-left corner of the crop area
        bottom_right: (x, y) coordinates of the bottom-right corner of the crop area
        output_path: Optional path to save the cropped image. If None, the
            cropped image is only returned.

    Returns:
        PIL Image object of the cropped image

    Raises:
        FileNotFoundError: If the input image file doesn't exist
        ValueError: If the image cannot be opened/saved or coordinates are invalid
    """
    # Validate input file exists
    if not os.path.exists(image_path):
        raise FileNotFoundError(f"Image file not found: {image_path}")

    try:
        source = Image.open(image_path)
    except Exception as e:
        raise ValueError(f"Failed to open image: {e}")

    # Use a context manager so the underlying file handle is closed promptly
    # (the previous implementation kept it open for the image's lifetime).
    with source as image:
        x1, y1 = top_left
        x2, y2 = bottom_right

        # Validate coordinates against the actual image dimensions
        width, height = image.size
        if x1 < 0 or y1 < 0 or x2 > width or y2 > height:
            raise ValueError(f"Coordinates out of bounds. Image size: {width}x{height}, requested crop: ({x1},{y1}) to ({x2},{y2})")
        if x2 <= x1 or y2 <= y1:
            raise ValueError(f"Invalid crop dimensions. Bottom-right must be greater than top-left: ({x1},{y1}) to ({x2},{y2})")

        # PIL crop box format: (left, upper, right, lower)
        cropped_image = image.crop((x1, y1, x2, y2))
        # Force pixel data to load now, before the source file is closed.
        cropped_image.load()

    # Save if output path provided
    if output_path:
        try:
            cropped_image.save(output_path)
            print(f"Cropped image saved to: {output_path}")
        except Exception as e:
            raise ValueError(f"Failed to save cropped image: {e}")

    return cropped_image
# Initialize OpenAI client with error handling
def get_openai_client():
    """Locate an OpenAI API key and build a client.

    Lookup order: OPENAI_API_KEY environment variable (normal for Hugging
    Face Spaces), then a Spaces-style secrets JSON file, then a local .env
    file for development. Returns the client, or None if no key was found
    or the client could not be constructed.
    """
    api_key = os.getenv("OPENAI_API_KEY")

    if not api_key:
        # Hugging Face Spaces may expose secrets as a JSON file.
        secrets_path = "/app/secrets.json"  # Hugging Face Spaces secrets location
        try:
            if os.path.exists(secrets_path):
                import json
                with open(secrets_path, 'r') as fh:
                    api_key = json.load(fh).get("OPENAI_API_KEY")
        except Exception as e:
            print(f"Could not read secrets file: {e}")

    if not api_key:
        # Local development fallback: pull the key from a .env file.
        try:
            from dotenv import load_dotenv
            load_dotenv()
            api_key = os.getenv("OPENAI_API_KEY")
            if api_key:
                print("Loaded API key from .env file (local development)")
        except ImportError:
            print("python-dotenv not installed, skipping .env file loading")
        except Exception as e:
            print(f"Could not load .env file: {e}")

    # Debug logging: report presence/length only, never the key itself.
    if api_key:
        print(f"Found API key (length: {len(api_key)})")
    else:
        print("No API key found in environment variables, secrets, or .env file")
        return None

    try:
        openai_client = OpenAI(api_key=api_key.strip())  # Strip any whitespace
        print("OpenAI client initialized successfully")
        return openai_client
    except Exception as e:
        print(f"Failed to initialize OpenAI client: {e}")
        return None
# Module-level OpenAI client shared by all handlers below; None when no
# API key could be found (handlers check for this and return errors).
client = get_openai_client()
def process_image_with_gpt4(image_data: str, crop_region: Optional[Tuple[Tuple[int, int], Tuple[int, int]]] = None) -> str:
    """
    Process image using GPT-4 vision API for OCR/text extraction.

    Args:
        image_data: Base64 encoded image data (raw base64 or a data: URL)
        crop_region: Optional ((x1, y1), (x2, y2)) pixel coordinates to crop
            the image before OCR

    Returns:
        The extracted text on success, or a string starting with "Error"
        on failure — callers check that prefix rather than catching
        exceptions.
    """
    try:
        if not client:
            return "Error: OpenAI client not initialized - check API key"
        print(f"Processing image data of length: {len(image_data)}")
        # Accept the data-URL form (data:image/png;base64,...) by stripping
        # everything up to and including the comma.
        if image_data.startswith('data:'):
            if ',' in image_data:
                image_data = image_data.split(',')[1]
            else:
                return "Error: Invalid data URL format - no comma found"
        # Decode and sanity-check the base64 payload before spending an API call.
        try:
            image_bytes = base64.b64decode(image_data)
            print(f"Decoded image bytes length: {len(image_bytes)}")
        except Exception as decode_error:
            return f"Error: Failed to decode base64 - {str(decode_error)}"
        # Verify the decoded bytes are a loadable image.
        try:
            image = Image.open(io.BytesIO(image_bytes))
            print(f"Image validated: {image.size}, mode: {image.mode}")
        except Exception as image_error:
            return f"Error: Failed to validate image - {str(image_error)}"
        # Optionally crop before OCR; any cropping failure falls back to the
        # full image rather than aborting.
        if crop_region:
            try:
                top_left, bottom_right = crop_region
                print(f"Applying crop with coordinates: {top_left} to {bottom_right}")
                # PIL crop box format: (left, upper, right, lower)
                crop_box = (top_left[0], top_left[1], bottom_right[0], bottom_right[1])
                # Clamp out-of-bounds coordinates into the image instead of failing.
                width, height = image.size
                x1, y1, x2, y2 = crop_box
                if x1 < 0 or y1 < 0 or x2 > width or y2 > height:
                    print(f"Warning: Crop coordinates partially out of bounds. Image size: {width}x{height}, requested crop: {crop_box}")
                    # Adjust coordinates to stay within bounds
                    x1 = max(0, min(x1, width))
                    y1 = max(0, min(y1, height))
                    x2 = max(0, min(x2, width))
                    y2 = max(0, min(y2, height))
                    crop_box = (x1, y1, x2, y2)
                if x2 > x1 and y2 > y1:
                    original_size = image.size
                    image = image.crop(crop_box)
                    print(f"Image cropped from {original_size} to {image.size}")
                    # Re-encode so the API receives the cropped image, not the original.
                    buffered = io.BytesIO()
                    image.save(buffered, format="PNG")
                    image_data = base64.b64encode(buffered.getvalue()).decode()
                else:
                    print(f"Warning: Invalid crop dimensions, skipping crop: {crop_box}")
            except Exception as crop_error:
                print(f"Warning: Failed to crop image: {crop_error}")
                # Continue with original image
        # Save the (possibly cropped) input for inspection when LOCAL_DEBUG=true.
        is_local_debug = os.getenv("LOCAL_DEBUG", "false").lower() == "true"
        if is_local_debug:
            try:
                import datetime
                timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
                debug_filename = f"debug_input_{timestamp}.png"
                Image.open(io.BytesIO(base64.b64decode(image_data))).save(debug_filename)
                print(f"🐛 DEBUG: Saved input image as {debug_filename}")
            except Exception as save_error:
                print(f"Could not save debug image: {save_error}")
        # OCR step: send the image plus an extraction prompt to GPT-4o.
        try:
            print("Calling GPT-4 Vision API...")
            response = client.chat.completions.create(
                model="gpt-4o",
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image/png;base64,{image_data}"
                                }
                            },
                            {
                                "type": "text",
                                "text": "Extract all text from this image. Include every word, number, symbol, and mathematical notation. If there are multiple questions or sections, preserve the structure. Use LaTeX for mathematical expressions."
                            }
                        ]
                    }
                ],
                max_tokens=1000,
                temperature=0.3
            )
            extracted_text = response.choices[0].message.content.strip()
            print(f"GPT-4 Vision extraction completed. Text length: {len(extracted_text)}")
            if is_local_debug:
                print(f"🐛 DEBUG: Extracted text: '{extracted_text}'")
            return extracted_text
        except Exception as ocr_error:
            error_msg = f"Error: GPT-4 Vision processing failed - {str(ocr_error)}"
            print(f"❌ {error_msg}")
            import traceback
            print(f"❌ Full traceback: {traceback.format_exc()}")
            return error_msg
    except Exception as e:
        return f"Error: Unexpected error - {str(e)}"
def compare_ocr_with_crop(image_data: str, crop_region: Tuple[Tuple[int, int], Tuple[int, int]]) -> dict:
    """
    Run GPT-4 Vision OCR twice — once on the full image and once on a
    cropped region — and report how the two extractions differ.

    Args:
        image_data: Base64 encoded image data
        crop_region: ((x1, y1), (x2, y2)) coordinates for cropping

    Returns:
        Dictionary with per-variant text/length/error info plus a
        "comparison" summary section.
    """
    print("🔍 Starting OCR comparison: cropped vs uncropped (GPT-4 Vision)")
    print(f"📏 Crop region: {crop_region[0]} to {crop_region[1]}")

    print("\n📄 Processing uncropped image...")
    full_text = process_image_with_gpt4(image_data, crop_region=None)

    print("\n✂️ Processing cropped image...")
    crop_text = process_image_with_gpt4(image_data, crop_region=crop_region)

    # An "Error"-prefixed string is the pipeline's failure signal.
    full_failed = full_text.startswith("Error")
    crop_failed = crop_text.startswith("Error")
    delta = len(crop_text) - len(full_text)

    results = {
        "crop_region": crop_region,
        "uncropped": {
            "text": full_text,
            "length": len(full_text),
            "is_error": full_failed
        },
        "cropped": {
            "text": crop_text,
            "length": len(crop_text),
            "is_error": crop_failed
        },
        "comparison": {
            "texts_match": full_text == crop_text,
            "length_difference": delta,
            "cropped_has_more_text": delta > 0,
            "both_successful": not (full_failed or crop_failed)
        }
    }

    # Human-readable summary of the comparison.
    print("\n📊 OCR Comparison Results:")
    print(f"Uncropped text length: {len(full_text)} characters")
    print(f"Cropped text length: {len(crop_text)} characters")
    print(f"Length difference: {delta} characters")
    print(f"Texts match exactly: {results['comparison']['texts_match']}")
    print(f"Both OCR operations successful: {results['comparison']['both_successful']}")

    if results['comparison']['both_successful']:
        if results['comparison']['texts_match']:
            print("✅ Cropped and uncropped images produced identical OCR results")
        elif results['comparison']['cropped_has_more_text']:
            print("📈 Cropped image produced more text than uncropped image")
        else:
            print("📉 Cropped image produced less text than uncropped image")
    else:
        print("❌ One or both OCR operations failed")

    return results
def get_gpt_answer(question: str) -> str:
    """
    Ask the chat model to answer an extracted question.

    Returns the model's answer, or a string starting with "Error" when the
    client is missing/misconfigured or the API call fails.
    """
    try:
        if not client:
            return "Error: OpenAI client not initialized - check API key configuration"
        # getattr with a None default covers both a missing attribute and an empty key.
        if not getattr(client, 'api_key', None):
            return "Error: OpenAI API key not configured"
        chat = client.chat.completions.create(
            model="gpt-4",
            messages=[
                {
                    "role": "system",
                    "content": "You are a helpful assistant that provides concise, accurate answers to academic questions. Keep responses brief but informative."
                },
                {
                    "role": "user",
                    "content": f"Please answer this question concisely: {question}"
                }
            ],
            max_tokens=500,
            temperature=0.3
        )
        return chat.choices[0].message.content.strip()
    except Exception as e:
        return f"Error: {str(e)}"
def process_screenshot_with_crop_comparison(image_data: str, crop_region: Tuple[Tuple[int, int], Tuple[int, int]]) -> dict:
    """
    Process screenshot with crop comparison - extracts text from both cropped
    and uncropped versions and gets GPT answers for both, then compares the
    results.

    Args:
        image_data: Base64 encoded image data
        crop_region: ((x1, y1), (x2, y2)) coordinates for cropping

    Returns:
        Dictionary containing OCR and GPT results for both versions, or
        {"error": ...} when no image data was supplied.
    """
    print("🎯 Starting screenshot processing with crop comparison (GPT-4 Vision)")
    if not image_data:
        return {"error": "No image data provided"}
    # Run OCR on both variants and collect the comparison metadata.
    ocr_comparison = compare_ocr_with_crop(image_data, crop_region)
    # Ask GPT for an answer per variant, but only where OCR succeeded.
    gpt_answers = {}
    if not ocr_comparison["uncropped"]["is_error"]:
        print("\n🤖 Getting GPT answer for uncropped text...")
        gpt_answers["uncropped"] = get_gpt_answer(ocr_comparison["uncropped"]["text"])
    if not ocr_comparison["cropped"]["is_error"]:
        print("\n🤖 Getting GPT answer for cropped text...")
        gpt_answers["cropped"] = get_gpt_answer(ocr_comparison["cropped"]["text"])
    # Build final result
    result = {
        "crop_region": crop_region,
        "ocr_comparison": ocr_comparison,
        "gpt_answers": gpt_answers,
        "success": True
    }
    # When both variants produced answers, record whether they agree.
    if "uncropped" in gpt_answers and "cropped" in gpt_answers:
        answers_match = gpt_answers["uncropped"] == gpt_answers["cropped"]
        result["comparison"] = {
            "gpt_answers_match": answers_match,
            "ocr_texts_match": ocr_comparison["comparison"]["texts_match"]
        }
        print(f"\n🎯 Final Comparison:")
        print(f"OCR texts match: {ocr_comparison['comparison']['texts_match']}")
        print(f"GPT answers match: {answers_match}")
        if answers_match:
            print("✅ Cropped and uncropped processing produced identical results")
        else:
            print("⚠️ Cropped and uncropped processing produced different results")
    return result
def process_screenshot(image_data: str) -> dict:
    """
    Full pipeline: OCR the screenshot with GPT-4 Vision, then answer the
    extracted question with GPT.

    Returns a dict with "extracted_text"/"answer"/"success", or {"error": ...}
    when the input is missing or OCR failed.
    """
    if not image_data:
        return {"error": "No image data provided"}

    text = process_image_with_gpt4(image_data)
    if text.startswith("Error"):
        return {"error": text}

    return {
        "extracted_text": text,
        "answer": get_gpt_answer(text),
        "success": True
    }
# Gradio interface for testing
def gradio_interface(image):
    """
    Gradio handler: encode the uploaded PIL image as base64 PNG, run the
    pipeline, and return (extracted text, answer) for the two output boxes.
    """
    if image is None:
        return "No image provided", "No answer available"

    # PIL image -> base64-encoded PNG, the format process_screenshot expects.
    png_buffer = io.BytesIO()
    image.save(png_buffer, format="PNG")
    encoded = base64.b64encode(png_buffer.getvalue()).decode()

    outcome = process_screenshot(encoded)
    if "error" in outcome:
        return outcome["error"], "Error occurred"
    return outcome["extracted_text"], outcome["answer"]
# Create the Gradio demo UI: one image input feeding gradio_interface,
# two text outputs (the OCR'd text and the GPT answer).
demo = gr.Interface(
    fn=gradio_interface,
    inputs=gr.Image(type="pil", label="Upload Screenshot"),
    outputs=[
        gr.Textbox(label="Extracted Text", lines=3),
        gr.Textbox(label="GPT Answer", lines=5)
    ],
    title="Educational Question Solver",
    description="Upload a screenshot of a question and get an AI-powered answer for educational purposes.",
    examples=None
)
# FastAPI setup for proper API endpoints
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
import uvicorn

app = FastAPI(title="Educational Question Solver API")

# Permissive CORS so browser/Electron clients on other origins can call the API.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
class ImageRequest(BaseModel):
    """Request body for /api/process: a base64-encoded screenshot."""
    # Raw base64 or a data: URL, as accepted by process_image_with_gpt4.
    image_data: str
@app.post("/api/process")
async def process_image_endpoint(request: ImageRequest):
    """
    API endpoint that the Electron app will call.

    Runs the OCR + answer pipeline and surfaces any unexpected failure
    as an HTTP 500 with the exception message as the detail.
    """
    try:
        return process_screenshot(request.image_data)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
@app.get("/")
async def root():
    # Simple liveness message for the API root.
    return {"message": "Educational Question Solver API is running"}
@app.get("/health")
async def health_check():
    """Report service status: client presence, OCR model name, debug flag."""
    return {
        "status": "healthy",
        "openai_client": client is not None,
        "ocr_model": "gpt-4o-vision",
        "local_debug": os.getenv("LOCAL_DEBUG", "false").lower() == "true",
        # Expose only the key's length (never the key) to help diagnose config issues.
        "openai_api_key_length": len(os.getenv("OPENAI_API_KEY", "")) if os.getenv("OPENAI_API_KEY") else 0
    }
@app.get("/test-openai")
async def test_openai():
    """Test endpoint to check OpenAI functionality"""
    if not client:
        return {"error": "OpenAI client not initialized"}
    try:
        # Minimal round-trip to confirm the chat completions API is reachable.
        reply = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": "Say 'Hello'"}],
            max_tokens=10
        )
        return {
            "success": True,
            "response": reply.choices[0].message.content.strip()
        }
    except Exception as e:
        return {"error": f"OpenAI test failed: {str(e)}"}
@app.get("/test-vision")
async def test_vision():
    """Test endpoint to check GPT-4 Vision functionality"""
    if not client:
        return {"error": "OpenAI client not initialized"}
    try:
        # Render a tiny known image so the vision call has deterministic content.
        from PIL import Image, ImageDraw
        canvas = Image.new('RGB', (200, 100), color='white')
        ImageDraw.Draw(canvas).text((10, 10), "TEST IMAGE", fill='black')

        # Encode it as base64 PNG for the data-URL payload.
        png_bytes = io.BytesIO()
        canvas.save(png_bytes, format="PNG")
        encoded = base64.b64encode(png_bytes.getvalue()).decode()

        reply = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/png;base64,{encoded}"
                            }
                        },
                        {
                            "type": "text",
                            "text": "What text do you see in this image?"
                        }
                    ]
                }
            ],
            max_tokens=100
        )
        return {
            "success": True,
            "response": reply.choices[0].message.content.strip()
        }
    except Exception as e:
        return {"error": f"Vision test failed: {str(e)}"}
# Mount Gradio app: serve the demo UI under /gradio on the same FastAPI server.
app = gr.mount_gradio_app(app, demo, path="/gradio")

if __name__ == "__main__":
    # For local testing; port 7860 is the Hugging Face Spaces convention.
    uvicorn.run(app, host="0.0.0.0", port=7860)