chitrark committed on
Commit
097f30f
·
verified ·
1 Parent(s): ffb2e43

updated to fix some issues

Browse files
Files changed (1) hide show
  1. app.py +42 -9
app.py CHANGED
@@ -2,7 +2,8 @@ import os
2
  import base64
3
  from io import BytesIO
4
  import warnings
5
- import time # For timing
 
6
 
7
  import torch
8
  from PIL import Image
@@ -28,7 +29,7 @@ def load_model():
28
  processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
29
  model = AutoModelForVision2Seq.from_pretrained(
30
  MODEL_ID,
31
- dtype=torch.float16, # Fixed deprecation
32
  device_map="auto",
33
  low_cpu_mem_usage=True,
34
  trust_remote_code=True,
@@ -56,13 +57,47 @@ def build_prompt(width: int, height: int) -> str:
56
  )
57
 
58
 
59
- def ocr_image(img: Image.Image) -> tuple[str, str]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  if img is None:
61
  return "No image uploaded.", "0.0s"
62
 
63
- start_time = time.perf_counter() # High-precision timer
64
  load_model()
65
 
 
 
 
 
 
 
66
  img = img.convert("RGB")
67
  img = _resize_max_side(img, max_side=896)
68
  w, h = img.size
@@ -95,7 +130,7 @@ def ocr_image(img: Image.Image) -> tuple[str, str]:
95
  padding=True,
96
  return_tensors="pt",
97
  )
98
-
99
  # Move inputs to model device
100
  inputs = {k: v.to(model.device) if torch.is_tensor(v) else v for k, v in inputs.items()}
101
 
@@ -113,7 +148,6 @@ def ocr_image(img: Image.Image) -> tuple[str, str]:
113
 
114
  elapsed = time.perf_counter() - start_time
115
  timing = f"{elapsed:.2f}s"
116
-
117
  return result, timing
118
 
119
 
@@ -121,7 +155,7 @@ with gr.Blocks(title="BookReader OCR API (olmOCR2)") as demo:
121
  gr.Markdown(
122
  "# BookReader OCR API (olmOCR2)\n"
123
  "Upload an image → get extracted text + timing.\n\n"
124
- "**API endpoint:** `/ocr`"
125
  )
126
 
127
  with gr.Row():
@@ -136,9 +170,8 @@ with gr.Blocks(title="BookReader OCR API (olmOCR2)") as demo:
136
  fn=ocr_image,
137
  inputs=[image_input],
138
  outputs=[output, timing],
139
- api_name="/ocr",
140
  )
141
 
142
  if __name__ == "__main__":
143
  demo.queue().launch(show_error=True)
144
-
 
2
  import base64
3
  from io import BytesIO
4
  import warnings
5
+ import time
6
+ from typing import Union
7
 
8
  import torch
9
  from PIL import Image
 
29
  processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
30
  model = AutoModelForVision2Seq.from_pretrained(
31
  MODEL_ID,
32
+ dtype=torch.float16,
33
  device_map="auto",
34
  low_cpu_mem_usage=True,
35
  trust_remote_code=True,
 
57
  )
58
 
59
 
60
+ def _coerce_to_pil(img: Union[Image.Image, dict, str]) -> Image.Image:
61
+ """
62
+ Gradio UI often passes a PIL Image.
63
+ gradio_client often passes a dict like {"path": "..."} or a string path.
64
+ This function normalizes everything into a PIL Image.
65
+ """
66
+ if isinstance(img, Image.Image):
67
+ return img
68
+
69
+ if isinstance(img, str):
70
+ return Image.open(img)
71
+
72
+ if isinstance(img, dict):
73
+ # gradio_client image payload typically includes "path"
74
+ path = img.get("path")
75
+ if path:
76
+ return Image.open(path)
77
+
78
+ # sometimes it may include "url" (less common)
79
+ url = img.get("url")
80
+ if url and url.startswith("data:image"):
81
+ header, b64 = url.split(",", 1)
82
+ data = base64.b64decode(b64)
83
+ return Image.open(BytesIO(data))
84
+
85
+ raise ValueError(f"Unsupported image input type: {type(img)} / {img}")
86
+
87
+
88
+ def ocr_image(img: Union[Image.Image, dict, str]) -> tuple[str, str]:
89
  if img is None:
90
  return "No image uploaded.", "0.0s"
91
 
92
+ start_time = time.perf_counter()
93
  load_model()
94
 
95
+ # ✅ Normalize input (fixes API calls crashing)
96
+ try:
97
+ img = _coerce_to_pil(img)
98
+ except Exception as e:
99
+ return f"Bad image input: {e}", "0.0s"
100
+
101
  img = img.convert("RGB")
102
  img = _resize_max_side(img, max_side=896)
103
  w, h = img.size
 
130
  padding=True,
131
  return_tensors="pt",
132
  )
133
+
134
  # Move inputs to model device
135
  inputs = {k: v.to(model.device) if torch.is_tensor(v) else v for k, v in inputs.items()}
136
 
 
148
 
149
  elapsed = time.perf_counter() - start_time
150
  timing = f"{elapsed:.2f}s"
 
151
  return result, timing
152
 
153
 
 
155
  gr.Markdown(
156
  "# BookReader OCR API (olmOCR2)\n"
157
  "Upload an image → get extracted text + timing.\n\n"
158
+ "**API endpoint:** `//ocr` (note the double slash)"
159
  )
160
 
161
  with gr.Row():
 
170
  fn=ocr_image,
171
  inputs=[image_input],
172
  outputs=[output, timing],
173
+ api_name="/ocr", # ✅ match what your client discovered
174
  )
175
 
176
  if __name__ == "__main__":
177
  demo.queue().launch(show_error=True)