Spaces:

OmarAbualrob
/

ocr-api

Paused

App Files Files Community

OmarAbualrob committed on Jul 29, 2025

Commit

f8797e4

verified ·

1 Parent(s): 29fc161

Update app.py

Browse files

Files changed (1) hide show

app.py +65 -40

app.py CHANGED Viewed

@@ -2,83 +2,108 @@ from fastapi import FastAPI, File, UploadFile, HTTPException
 from fastapi.responses import JSONResponse
 from transformers import AutoProcessor, AutoModelForCausalLM
 from PIL import Image
 import io
 import logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
-app = FastAPI(title="Florence-2 OCR API (CPU)", description="An API to extract text from images using the Florence-2-large model on CPU.")
-# --- Global Variables and Device Configuration ---
-device = "cpu"  # Force CPU
-model = None
-processor = None
-# --- Model Loading Logic (at startup) ---
-@app.on_event("startup")
-async def startup_event():
-    global model, processor
-    try:
-        logger.info(f"Using device: {device}")
-        logger.info("Starting model loading process for CPU...")
-        model_id = "microsoft/Florence-2-large"
-        # Load the model in full precision for CPU
-        model = AutoModelForCausalLM.from_pretrained(
-            model_id,
-            trust_remote_code=True,
-        )
-        processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
-        logger.info("Model and processor loaded successfully on CPU.")
-    except Exception as e:
-        logger.error(f"FATAL: An error occurred during model loading: {e}", exc_info=True)
-# --- Define the OCR Task Function (CPU version) ---
 def run_ocr(image: Image.Image) -> str:
     if model is None or processor is None:
-        raise RuntimeError("Model is not available. Check startup logs for loading errors.")
     if image.mode != "RGB":
         image = image.convert("RGB")
     prompt = "<OCR>"
     inputs = processor(text=prompt, images=image, return_tensors="pt")
-    # Generate on CPU (no .to(device) or dtype changes needed)
     generated_ids = model.generate(
         input_ids=inputs["input_ids"],
         pixel_values=inputs["pixel_values"],
-        max_new_tokens=4096,
-        do_sample=False,
         num_beams=3
     )
     generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
     parsed_text = processor.post_process_generation(generated_text, task="<OCR>", image_size=(image.width, image.height))
     return parsed_text.get("<OCR>", "Error: Could not parse OCR output.")
-# --- API Endpoints ---
-# (Your @app.post and @app.get endpoints remain exactly the same)
 @app.post("/ocr", summary="Extract Text from Image")
 async def perform_ocr(file: UploadFile = File(..., description="Image file to perform OCR on.")):
     if model is None:
-        raise HTTPException(status_code=503, detail="Model is not loaded or unavailable. Please check the server logs.")
     if not file.content_type.startswith("image/"):
         raise HTTPException(status_code=400, detail="Invalid file type. Please upload an image.")
     try:
         contents = await file.read()
         image = Image.open(io.BytesIO(contents))
-        logger.info(f"Running OCR on uploaded file: {file.filename}")
         extracted_text = run_ocr(image)
         logger.info("OCR completed successfully.")
-        return JSONResponse(content={"filename": file.filename, "text": extracted_text})
     except Exception as e:
-        logger.error(f"An error occurred during OCR processing for {file.filename}: {e}", exc_info=True)
-        raise HTTPException(status_code=500, detail=f"An internal server error occurred: {str(e)}")
 @app.get("/", summary="Health Check")
 def read_root():
-    return {"status": "ok", "model_loaded": model is not None, "device": device}

 from fastapi.responses import JSONResponse
 from transformers import AutoProcessor, AutoModelForCausalLM
 from PIL import Image
+import torch
 import io
 import logging
+# Set up logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
+# --- 1. Initialize FastAPI App ---
+app = FastAPI(title="Mixed-Content OCR API", description="An API to extract text from images containing both printed and handwritten text.")
+# --- 2. Load the Model and Processor (at startup) ---
+# This is a critical step. We load the model only once when the app starts.
+# This prevents reloading the model on every API call, which would be very slow.
+try:
+    logger.info("Loading model and processor...")
+    # Use the large model for better accuracy
+    model_id = "microsoft/Florence-2-large"
+    # NOTE: We need to trust remote code for Florence-2
+    model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)
+    processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
+    logger.info("Model and processor loaded successfully.")
+except Exception as e:
+    logger.error(f"Error loading model: {e}")
+    # If the model fails to load, the API is not usable. We can't proceed.
+    model = None
+    processor = None
+# --- 3. Define the OCR Task Function ---
 def run_ocr(image: Image.Image) -> str:
     """Extract text from a PIL image using the Florence-2 ``<OCR>`` task.
     The image is converted to RGB when it is in any other mode.
     Returns the value stored under ``<OCR>`` in the post-processed output,
     or a fixed error string when that key is absent.
     Raises RuntimeError when the module-level model or processor is None.
     """
     components_ready = model is not None and processor is not None
     if not components_ready:
         raise RuntimeError("Model is not available. Check logs for loading errors.")
     # The same token is used as the prompt, the post-processing task name and the result key.
     task_token = "<OCR>"
     rgb_image = image if image.mode == "RGB" else image.convert("RGB")
     # Tokenize the prompt and turn the image into pixel tensors.
     encoded = processor(text=task_token, images=rgb_image, return_tensors="pt")
     # do_sample=False with num_beams=3 is deterministic beam search (not greedy decoding).
     # NOTE(review): 4096 new tokens is a generous cap for long documents —
     # confirm it does not exceed the decoder's maximum sequence length.
     generation_kwargs = {
         "input_ids": encoded["input_ids"],
         "pixel_values": encoded["pixel_values"],
         "max_new_tokens": 4096,
         "do_sample": False,
         "num_beams": 3,
     }
     output_ids = model.generate(**generation_kwargs)
     # Special tokens are kept because post_process_generation parses the raw task output.
     raw_output = processor.batch_decode(output_ids, skip_special_tokens=False)[0]
     width_height = (rgb_image.width, rgb_image.height)
     parsed = processor.post_process_generation(raw_output, task=task_token, image_size=width_height)
     return parsed.get(task_token, "Error: Could not parse OCR output.")
+# --- 4. Create the API Endpoint ---
 @app.post("/ocr", summary="Extract Text from Image")
 async def perform_ocr(file: UploadFile = File(..., description="Image file to perform OCR on.")):
     """
     Takes an image file, extracts both printed and handwritten text,
     and returns it as a JSON object of the form
     ``{"filename": <uploaded name>, "text": <extracted text>}``.
     Raises HTTPException:
       503 - the model or processor failed to load at startup;
       400 - the upload has no ``image/*`` content type or cannot be decoded as an image;
       500 - any other error while reading the upload or running OCR.
     """
     # Imported locally so this handler does not depend on edits to the file's import block.
     from PIL import UnidentifiedImageError
     # run_ocr needs both objects, so report "unavailable" if either is missing.
     if model is None or processor is None:
         raise HTTPException(status_code=503, detail="Model is not loaded or unavailable.")
     # content_type may be None when the client omits the header; treat that as invalid
     # instead of crashing on None.startswith.
     content_type = file.content_type or ""
     if not content_type.startswith("image/"):
         raise HTTPException(status_code=400, detail="Invalid file type. Please upload an image.")
     try:
         # Read the image content from the uploaded file
         contents = await file.read()
         image = Image.open(io.BytesIO(contents))
         # Run the OCR task
         # NOTE(review): run_ocr is a blocking call inside an async handler;
         # consider offloading it to a worker thread if request concurrency matters.
         logger.info("Running OCR on the uploaded image...")
         extracted_text = run_ocr(image)
         logger.info("OCR completed successfully.")
         # Return the result
         return JSONResponse(
             content={"filename": file.filename, "text": extracted_text}
         )
     except UnidentifiedImageError as e:
         # The bytes are not a recognizable image: this is a client error, not a server fault.
         logger.warning("Uploaded file could not be decoded as an image: %s", e)
         raise HTTPException(status_code=400, detail="Invalid file type. Please upload an image.") from e
     except Exception as e:
         # logger.exception keeps the traceback for diagnosis.
         logger.exception("An error occurred during OCR processing: %s", e)
         raise HTTPException(status_code=500, detail=f"An internal error occurred: {str(e)}") from e
 @app.get("/", summary="Health Check")
 def read_root():
     """Health check: report that the API is up and whether the model global is set."""
     is_model_loaded = model is not None
     health = {"status": "ok", "model_loaded": is_model_loaded}
     return health