Spaces:

OmarAbualrob
/

ocr-api

Paused

App Files Files Community

OmarAbualrob committed on Jul 29, 2025

Commit

2a409bc

verified ·

1 Parent(s): a039eef

Update app.py

Browse files

Files changed (1) hide show

app.py +24 -20

app.py CHANGED Viewed

@@ -14,19 +14,31 @@ logger = logging.getLogger(__name__)
 app = FastAPI(title="Mixed-Content OCR API", description="An API to extract text from images containing both printed and handwritten text.")
 # --- 2. Load the Model and Processor (at startup) ---
-# This is a critical step. We load the model only once when the app starts.
-# This prevents reloading the model on every API call, which would be very slow.
 try:
     logger.info("Loading model and processor...")
-    # Use the large model for better accuracy
     model_id = "microsoft/Florence-2-large"
-    # NOTE: We need to trust remote code for Florence-2
-    model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)
     processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
     logger.info("Model and processor loaded successfully.")
 except Exception as e:
     logger.error(f"Error loading model: {e}")
-    # If the model fails to load, the API is not usable. We can't proceed.
     model = None
     processor = None
@@ -38,31 +50,27 @@ def run_ocr(image: Image.Image) -> str:
     if model is None or processor is None:
         raise RuntimeError("Model is not available. Check logs for loading errors.")
-    # Ensure image is in RGB format
     if image.mode != "RGB":
         image = image.convert("RGB")
-    # Define the task prompt
     prompt = "<OCR>"
     # Preprocess the image and prompt
     inputs = processor(text=prompt, images=image, return_tensors="pt")
-    # Generate text from the image
-    # Note: max_new_tokens can be adjusted based on expected text length
     generated_ids = model.generate(
         input_ids=inputs["input_ids"],
         pixel_values=inputs["pixel_values"],
-        max_new_tokens=4096, # Increased token limit for long documents
-        do_sample=False, # Use greedy decoding for deterministic output
         num_beams=3
     )
-    # Decode the generated IDs to a string
     generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
-    # Post-process the output to get the clean text
-    # The model's output for OCR is typically in the format: <OCR>extracted_text</s>
     parsed_text = processor.post_process_generation(generated_text, task="<OCR>", image_size=(image.width, image.height))
     return parsed_text.get("<OCR>", "Error: Could not parse OCR output.")
@@ -78,21 +86,17 @@ async def perform_ocr(file: UploadFile = File(..., description="Image file to pe
     if model is None:
         raise HTTPException(status_code=503, detail="Model is not loaded or unavailable.")
-    # Validate file type
     if not file.content_type.startswith("image/"):
         raise HTTPException(status_code=400, detail="Invalid file type. Please upload an image.")
     try:
-        # Read the image content from the uploaded file
         contents = await file.read()
         image = Image.open(io.BytesIO(contents))
-        # Run the OCR task
         logger.info("Running OCR on the uploaded image...")
         extracted_text = run_ocr(image)
         logger.info("OCR completed successfully.")
-        # Return the result
         return JSONResponse(
             content={"filename": file.filename, "text": extracted_text}
         )
@@ -106,4 +110,4 @@ def read_root():
     """
     A simple health check endpoint to confirm the API is running.
     """
-    return {"status": "ok", "model_loaded": model is not None}

 app = FastAPI(title="Mixed-Content OCR API", description="An API to extract text from images containing both printed and handwritten text.")
 # --- 2. Load the Model and Processor (at startup) ---
+# A. Set up the device to use the GPU (T4) if available
+device = "cuda" if torch.cuda.is_available() else "cpu"
+logger.info(f"Using device: {device}")
+# B. Half precision on GPU: the T4 (Turing) accelerates float16 and has no native bfloat16; keep float32 on CPU
+torch_dtype = torch.float16 if device == "cuda" else torch.float32
 try:
     logger.info("Loading model and processor...")
     model_id = "microsoft/Florence-2-large"
+    # C. Load the model with the specified dtype and send it to the GPU
+    model = AutoModelForCausalLM.from_pretrained(
+        model_id,
+        trust_remote_code=True,
+        torch_dtype=torch_dtype
+    ).to(device) # <-- Send the model to the GPU
     processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
     logger.info("Model and processor loaded successfully.")
 except Exception as e:
     logger.error(f"Error loading model: {e}")
+    # If the model fails to load, the API is not usable.
     model = None
     processor = None
     if model is None or processor is None:
         raise RuntimeError("Model is not available. Check logs for loading errors.")
     if image.mode != "RGB":
         image = image.convert("RGB")
     prompt = "<OCR>"
     # Preprocess the image and prompt
     inputs = processor(text=prompt, images=image, return_tensors="pt")
+    # D. IMPORTANT: Move the input tensors to the same device as the model (the GPU)
+    inputs = {k: v.to(device, dtype=torch_dtype if k == "pixel_values" else v.dtype) for k, v in inputs.items()}
     generated_ids = model.generate(
         input_ids=inputs["input_ids"],
         pixel_values=inputs["pixel_values"],
+        max_new_tokens=4096,
+        do_sample=False,
         num_beams=3
     )
     generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
     parsed_text = processor.post_process_generation(generated_text, task="<OCR>", image_size=(image.width, image.height))
     return parsed_text.get("<OCR>", "Error: Could not parse OCR output.")
     if model is None:
         raise HTTPException(status_code=503, detail="Model is not loaded or unavailable.")
     if not file.content_type.startswith("image/"):
         raise HTTPException(status_code=400, detail="Invalid file type. Please upload an image.")
     try:
         contents = await file.read()
         image = Image.open(io.BytesIO(contents))
         logger.info("Running OCR on the uploaded image...")
         extracted_text = run_ocr(image)
         logger.info("OCR completed successfully.")
         return JSONResponse(
             content={"filename": file.filename, "text": extracted_text}
         )
     """
     A simple health check endpoint to confirm the API is running.
     """
+    return {"status": "ok", "model_loaded": model is not None, "device": device}