chitrark committed on
Commit
778e3eb
·
verified ·
1 Parent(s): 097f30f

updated ocr path

Browse files
Files changed (1) hide show
  1. app.py +56 -33
app.py CHANGED
@@ -10,23 +10,33 @@ from PIL import Image
10
  import gradio as gr
11
  from transformers import AutoProcessor, AutoModelForVision2Seq
12
 
13
- # Suppress ALL startup noise BEFORE any imports
 
 
14
  os.environ["OMP_NUM_THREADS"] = "1"
15
  os.environ["TRANSFORMERS_VERBOSITY"] = "error"
16
  os.environ["TOKENIZERS_PARALLELISM"] = "false"
17
  warnings.filterwarnings("ignore")
18
 
 
 
 
19
  MODEL_ID = "allenai/olmOCR-2-7B-1025"
 
20
  processor = None
21
  model = None
22
 
23
 
24
  def load_model():
 
25
  global processor, model
26
  if processor is not None and model is not None:
27
  return
28
 
29
- processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
 
 
 
30
  model = AutoModelForVision2Seq.from_pretrained(
31
  MODEL_ID,
32
  dtype=torch.float16,
@@ -34,9 +44,13 @@ def load_model():
34
  low_cpu_mem_usage=True,
35
  trust_remote_code=True,
36
  ).eval()
37
- print("✅ Model loaded successfully!")
 
38
 
39
 
 
 
 
40
  def _resize_max_side(img: Image.Image, max_side: int = 896) -> Image.Image:
41
  w, h = img.size
42
  m = max(w, h)
@@ -52,16 +66,15 @@ def build_prompt(width: int, height: int) -> str:
52
  "Return ONLY the extracted text (no explanations, no markdown).\n"
53
  "Do not hallucinate.\n"
54
  "RAW_TEXT_START\n"
55
- f"Page dimensions: {width:.1f}x{height:.1f} [Image 0x0 to {width:.1f}x{height:.1f}]\n"
 
56
  "RAW_TEXT_END"
57
  )
58
 
59
 
60
  def _coerce_to_pil(img: Union[Image.Image, dict, str]) -> Image.Image:
61
  """
62
- Gradio UI often passes a PIL Image.
63
- gradio_client often passes a dict like {"path": "..."} or a string path.
64
- This function normalizes everything into a PIL Image.
65
  """
66
  if isinstance(img, Image.Image):
67
  return img
@@ -70,50 +83,54 @@ def _coerce_to_pil(img: Union[Image.Image, dict, str]) -> Image.Image:
70
  return Image.open(img)
71
 
72
  if isinstance(img, dict):
73
- # gradio_client image payload typically includes "path"
74
  path = img.get("path")
75
  if path:
76
  return Image.open(path)
77
 
78
- # sometimes it may include "url" (less common)
79
  url = img.get("url")
80
  if url and url.startswith("data:image"):
81
- header, b64 = url.split(",", 1)
82
- data = base64.b64decode(b64)
83
- return Image.open(BytesIO(data))
84
 
85
- raise ValueError(f"Unsupported image input type: {type(img)} / {img}")
86
 
87
 
 
 
 
88
  def ocr_image(img: Union[Image.Image, dict, str]) -> tuple[str, str]:
89
  if img is None:
90
  return "No image uploaded.", "0.0s"
91
 
92
- start_time = time.perf_counter()
93
  load_model()
94
 
95
- # ✅ Normalize input (fixes API calls crashing)
96
  try:
97
  img = _coerce_to_pil(img)
98
  except Exception as e:
99
- return f"Bad image input: {e}", "0.0s"
100
 
101
  img = img.convert("RGB")
102
- img = _resize_max_side(img, max_side=896)
103
  w, h = img.size
104
 
 
 
 
 
105
  buf = BytesIO()
106
  img.save(buf, format="PNG")
107
- image_b64 = base64.b64encode(buf.getvalue()).decode("utf-8")
108
-
109
- prompt = build_prompt(w, h)
110
 
111
  messages = [
112
  {
113
  "role": "user",
114
  "content": [
115
  {"type": "text", "text": prompt},
116
- {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_b64}"}},
 
 
 
117
  ],
118
  }
119
  ]
@@ -131,8 +148,10 @@ def ocr_image(img: Union[Image.Image, dict, str]) -> tuple[str, str]:
131
  return_tensors="pt",
132
  )
133
 
134
- # Move inputs to model device
135
- inputs = {k: v.to(model.device) if torch.is_tensor(v) else v for k, v in inputs.items()}
 
 
136
 
137
  with torch.inference_mode():
138
  output_ids = model.generate(
@@ -143,19 +162,22 @@ def ocr_image(img: Union[Image.Image, dict, str]) -> tuple[str, str]:
143
 
144
  prompt_len = inputs["input_ids"].shape[1]
145
  gen_ids = output_ids[:, prompt_len:]
146
- text_out = processor.tokenizer.batch_decode(gen_ids, skip_special_tokens=True)
147
- result = text_out[0].strip() if text_out else "No text extracted."
 
148
 
149
- elapsed = time.perf_counter() - start_time
150
- timing = f"{elapsed:.2f}s"
151
- return result, timing
152
 
153
 
 
 
 
154
  with gr.Blocks(title="BookReader OCR API (olmOCR2)") as demo:
155
  gr.Markdown(
156
- "# BookReader OCR API (olmOCR2)\n"
157
- "Upload an image get extracted text + timing.\n\n"
158
- "**API endpoint:** `//ocr` (note the double slash)"
159
  )
160
 
161
  with gr.Row():
@@ -168,10 +190,11 @@ with gr.Blocks(title="BookReader OCR API (olmOCR2)") as demo:
168
 
169
  run_btn.click(
170
  fn=ocr_image,
171
- inputs=[image_input],
172
  outputs=[output, timing],
173
- api_name="/ocr", # ✅ match what your client discovered
174
  )
175
 
 
176
  if __name__ == "__main__":
177
  demo.queue().launch(show_error=True)
 
10
  import gradio as gr
11
  from transformers import AutoProcessor, AutoModelForVision2Seq
12
 
13
+ # -----------------------------------------------------------------------------
14
+ # Environment + warnings (quiet startup)
15
+ # -----------------------------------------------------------------------------
16
  os.environ["OMP_NUM_THREADS"] = "1"
17
  os.environ["TRANSFORMERS_VERBOSITY"] = "error"
18
  os.environ["TOKENIZERS_PARALLELISM"] = "false"
19
  warnings.filterwarnings("ignore")
20
 
21
+ # -----------------------------------------------------------------------------
22
+ # Model config
23
+ # -----------------------------------------------------------------------------
24
  MODEL_ID = "allenai/olmOCR-2-7B-1025"
25
+
26
  processor = None
27
  model = None
28
 
29
 
30
  def load_model():
31
+ """Lazy-load model so Space boots fast."""
32
  global processor, model
33
  if processor is not None and model is not None:
34
  return
35
 
36
+ processor = AutoProcessor.from_pretrained(
37
+ MODEL_ID,
38
+ trust_remote_code=True,
39
+ )
40
  model = AutoModelForVision2Seq.from_pretrained(
41
  MODEL_ID,
42
  dtype=torch.float16,
 
44
  low_cpu_mem_usage=True,
45
  trust_remote_code=True,
46
  ).eval()
47
+
48
+ print("✅ olmOCR-2 model loaded")
49
 
50
 
51
+ # -----------------------------------------------------------------------------
52
+ # Helpers
53
+ # -----------------------------------------------------------------------------
54
  def _resize_max_side(img: Image.Image, max_side: int = 896) -> Image.Image:
55
  w, h = img.size
56
  m = max(w, h)
 
66
  "Return ONLY the extracted text (no explanations, no markdown).\n"
67
  "Do not hallucinate.\n"
68
  "RAW_TEXT_START\n"
69
+ f"Page dimensions: {width:.1f}x{height:.1f} "
70
+ f"[Image 0x0 to {width:.1f}x{height:.1f}]\n"
71
  "RAW_TEXT_END"
72
  )
73
 
74
 
75
  def _coerce_to_pil(img: Union[Image.Image, dict, str]) -> Image.Image:
76
  """
77
+ Normalize Gradio UI input and gradio_client input into a PIL Image.
 
 
78
  """
79
  if isinstance(img, Image.Image):
80
  return img
 
83
  return Image.open(img)
84
 
85
  if isinstance(img, dict):
 
86
  path = img.get("path")
87
  if path:
88
  return Image.open(path)
89
 
 
90
  url = img.get("url")
91
  if url and url.startswith("data:image"):
92
+ _, b64 = url.split(",", 1)
93
+ return Image.open(BytesIO(base64.b64decode(b64)))
 
94
 
95
+ raise ValueError(f"Unsupported image input: {type(img)}")
96
 
97
 
98
+ # -----------------------------------------------------------------------------
99
+ # OCR function (API)
100
+ # -----------------------------------------------------------------------------
101
  def ocr_image(img: Union[Image.Image, dict, str]) -> tuple[str, str]:
102
  if img is None:
103
  return "No image uploaded.", "0.0s"
104
 
105
+ start = time.perf_counter()
106
  load_model()
107
 
 
108
  try:
109
  img = _coerce_to_pil(img)
110
  except Exception as e:
111
+ return f"Invalid image input: {e}", "0.0s"
112
 
113
  img = img.convert("RGB")
114
+ img = _resize_max_side(img)
115
  w, h = img.size
116
 
117
+ # Build prompt
118
+ prompt = build_prompt(w, h)
119
+
120
+ # Encode image for VLM message
121
  buf = BytesIO()
122
  img.save(buf, format="PNG")
123
+ image_b64 = base64.b64encode(buf.getvalue()).decode()
 
 
124
 
125
  messages = [
126
  {
127
  "role": "user",
128
  "content": [
129
  {"type": "text", "text": prompt},
130
+ {
131
+ "type": "image_url",
132
+ "image_url": {"url": f"data:image/png;base64,{image_b64}"},
133
+ },
134
  ],
135
  }
136
  ]
 
148
  return_tensors="pt",
149
  )
150
 
151
+ inputs = {
152
+ k: v.to(model.device) if torch.is_tensor(v) else v
153
+ for k, v in inputs.items()
154
+ }
155
 
156
  with torch.inference_mode():
157
  output_ids = model.generate(
 
162
 
163
  prompt_len = inputs["input_ids"].shape[1]
164
  gen_ids = output_ids[:, prompt_len:]
165
+ text = processor.tokenizer.batch_decode(
166
+ gen_ids, skip_special_tokens=True
167
+ )
168
 
169
+ elapsed = time.perf_counter() - start
170
+ return (text[0].strip() if text else "No text extracted.", f"{elapsed:.2f}s")
 
171
 
172
 
173
+ # -----------------------------------------------------------------------------
174
+ # Gradio UI + API
175
+ # -----------------------------------------------------------------------------
176
  with gr.Blocks(title="BookReader OCR API (olmOCR2)") as demo:
177
  gr.Markdown(
178
+ "# 📖 BookReader OCR API (olmOCR2)\n"
179
+ "Upload an image and extract text using **olmOCR-2-7B**.\n\n"
180
+ "**API endpoint:** `/ocr`"
181
  )
182
 
183
  with gr.Row():
 
190
 
191
  run_btn.click(
192
  fn=ocr_image,
193
+ inputs=image_input,
194
  outputs=[output, timing],
195
+ api_name="/ocr",
196
  )
197
 
198
+
199
  if __name__ == "__main__":
200
  demo.queue().launch(show_error=True)