Spaces:

chitrark
/

olmocr-api

Running on CPU Upgrade

App Files Files Community

chitrark committed on Dec 19, 2025

Commit

a82af97

verified ·

1 Parent(s): ce86bd1

refactor: replace broken olmocr convert_files with VLM-based OCR pipeline

Browse files

Files changed (1) hide show

app.py +107 -36

app.py CHANGED Viewed

@@ -1,57 +1,128 @@
-import tempfile
-import gradio as gr
-# Updated import: convert_files is exposed from the top-level package
-from olmocr.runner import convert_files   # replace with the real module name
 MODEL_NAME = "allenai/olmOCR-2-7B-1025"
-def ocr(file_obj):
     if file_obj is None:
-        return "No file uploaded."
-    in_path = file_obj.name
-    with tempfile.TemporaryDirectory() as tmpdir:
-        results = convert_files(
-            inputs=[in_path],
-            output_dir=tmpdir,
-            model_name=MODEL_NAME,
         )
-        if not results:
-            return "No output."
-        r0 = results[0]
-        # Try direct text
-        text = getattr(r0, "text", None)
-        # Fallback: read from output file
-        if not text:
-            out_path = getattr(r0, "output_path", None)
-            if out_path:
-                with open(out_path, "r", encoding="utf-8") as f:
-                    text = f.read()
-        return (text or "No text extracted.").strip()
-with gr.Blocks(title="BookReader OCR API (olmOCR2)") as demo:
     gr.Markdown(
-        "# BookReader OCR API (olmOCR2)\n"
-        "Upload an image or PDF → get extracted text.\n\n"
-        "**API endpoint:** `/ocr`"
     )
-    upload = gr.File(label="Upload PDF or image", file_count="single")
-    output = gr.Textbox(label="Extracted text", lines=18)
     gr.Button("Run OCR").click(
-        fn=ocr,
-        inputs=[upload],
         outputs=[output],
         api_name="/ocr",
     )

+import base64
+from io import BytesIO
+import gradio as gr
+import torch
+from PIL import Image
+from transformers import (
+    AutoProcessor,
+    Qwen2_5_VLForConditionalGeneration,
+    Qwen2VLForConditionalGeneration,
+)
+# Module-level model setup: runs once at import time so every request reuses
+# the same weights.
+#
+# olmOCR-2-7B-1025 is a fine-tune of Qwen2.5-VL-7B-Instruct (see its model
+# card), so both the model class and the processor must come from the
+# Qwen2.5-VL family. Loading it through the Qwen2-VL class/processor mismatches
+# the checkpoint's architecture and preprocessing.
 MODEL_NAME = "allenai/olmOCR-2-7B-1025"
+PROCESSOR_NAME = "Qwen/Qwen2.5-VL-7B-Instruct"
+# Prefer the GPU when one is visible; otherwise fall back to CPU.
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+print("Loading model on", device)
+# bfloat16 halves the memory footprint of the 7B checkpoint; .eval() switches
+# off training-only behaviour such as dropout.
+model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+    MODEL_NAME,
+    torch_dtype=torch.bfloat16,
+).to(device).eval()
+processor = AutoProcessor.from_pretrained(PROCESSOR_NAME)
+def build_image_prompt(width: int, height: int) -> str:
+    """Return the olmOCR-style instruction prompt for a single page image.
+
+    The prompt is the fixed instruction sentence followed by a
+    ``RAW_TEXT_START`` / ``RAW_TEXT_END`` section whose only content is a
+    page-dimensions line describing one image that covers the whole page.
+
+    Args:
+        width: Page (image) width; rendered with one decimal place.
+        height: Page (image) height; rendered with one decimal place.
+
+    Returns:
+        The complete prompt text, with sections separated by newlines.
+    """
+    instruction = (
+        "Below is the image of one page of a document, as well as some raw textual "
+        "content that was previously extracted for it. "
+        "Just return the plain text representation of this document as if you "
+        "were reading it naturally. Do not hallucinate."
+    )
+    # The image box spans the full page, so both extents share the same text.
+    extent = f"{width:.1f}x{height:.1f}"
+    anchor = f"Page dimensions: {extent} [Image 0x0 to {extent}]"
+    return "\n".join((instruction, "RAW_TEXT_START", anchor, "RAW_TEXT_END"))
+def ocr_image(file_obj: "Image.Image | str | None"):
+    """Run the OCR model on one uploaded image and return the extracted text.
+
+    Args:
+        file_obj: The upload. The UI wires this to ``gr.Image(type="pil")``,
+            so it is normally a PIL image; a filesystem path or a file-like
+            object (optionally exposing ``.name``) is accepted as well.
+            ``None`` means nothing was uploaded.
+
+    Returns:
+        The decoded model output with surrounding whitespace removed, or a
+        short human-readable message when there is no input or no text.
+    """
     if file_obj is None:
+        return "No image uploaded."
+    # Image.open() only understands paths / file objects, so a PIL image
+    # coming from gr.Image(type="pil") must be used directly.
+    if isinstance(file_obj, Image.Image):
+        img = file_obj
+    else:
+        img = Image.open(getattr(file_obj, "name", file_obj))
+    img = img.convert("RGB")
+    # Downscale so the longest side is at most 1024 px (never upscale).
+    max_side = 1024
+    w, h = img.size
+    scale = min(max_side / max(w, h), 1.0)
+    if scale < 1.0:
+        # Clamp to 1 px: int() can truncate the short side of a very
+        # elongated image to 0, which resize() rejects.
+        new_size = (max(1, int(w * scale)), max(1, int(h * scale)))
+        img = img.resize(new_size, Image.LANCZOS)
+        w, h = img.size
+    # Build prompt for this image
+    prompt = build_image_prompt(w, h)
+    # The pixels reach the model through `images=[img]` below; the chat
+    # template only needs an image placeholder, so no base64 data URL is
+    # encoded here.
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": prompt},
+                {"type": "image"},
+            ],
+        }
+    ]
+    # Apply chat template and preprocess
+    text = processor.apply_chat_template(
+        messages,
+        tokenize=False,
+        add_generation_prompt=True,
+    )
+    inputs = processor(
+        text=[text],
+        images=[img],
+        padding=True,
+        return_tensors="pt",
+    )
+    inputs = {k: v.to(device) for k, v in inputs.items()}
+    # Generate output
+    with torch.no_grad():
+        output = model.generate(
+            **inputs,
+            temperature=0.6,
+            max_new_tokens=512,
+            num_return_sequences=1,
+            do_sample=True,
         )
+    # generate() returns prompt + continuation; keep only the new tokens.
+    prompt_len = inputs["input_ids"].shape[1]
+    new_tokens = output[:, prompt_len:]
+    text_output = processor.tokenizer.batch_decode(
+        new_tokens, skip_special_tokens=True
+    )
+    extracted = text_output[0].strip() if text_output else ""
+    # Treat an empty/whitespace-only decode the same as no output at all.
+    return extracted or "No text extracted."
+# Gradio UI: a heading, an image upload, a text box for the result, and a
+# button whose click handler runs `ocr_image` on the uploaded image.
+with gr.Blocks(title="olmOCR‑2 Image OCR") as demo:
     gr.Markdown(
+        "# olmOCR‑2 Image OCR\n"
+        "Upload an image and get extracted text using the olmOCR‑2‑7B model."
     )
+    # type="pil" asks Gradio to pass the upload to the callback as a PIL image.
+    image_input = gr.Image(type="pil", label="Upload image")
+    output = gr.Textbox(label="Extracted text", lines=20)
+    # The click handler feeds `image_input` to `ocr_image` and writes the
+    # returned string into `output`; `api_name` names the API route for it.
     gr.Button("Run OCR").click(
+        fn=ocr_image,
+        inputs=[image_input],
         outputs=[output],
         api_name="/ocr",
     )