chitrark committed on
Commit
c70f99d
·
verified ·
1 Parent(s): c8aa7db

Updated with a different change for the OOM issue

Browse files
Files changed (1) hide show
  1. app.py +59 -51
app.py CHANGED
@@ -5,61 +5,73 @@ import torch
5
  from PIL import Image
6
  import gradio as gr
7
 
8
- from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
9
-
10
- # ----- Model & processor -----
11
 
 
12
  MODEL_NAME = "allenai/olmOCR-2-7B-1025"
13
- PROCESSOR_NAME = "Qwen/Qwen2.5-VL-7B-Instruct"
 
 
 
 
14
 
15
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
16
- print("Loading model on", device)
17
 
18
- model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
19
- MODEL_NAME,
20
- dtype=torch.bfloat16, # use bfloat16 on GPU; change to torch.float16 if needed
21
- ).to(device).eval()
 
22
 
23
- processor = AutoProcessor.from_pretrained(PROCESSOR_NAME)
24
 
 
 
 
 
 
 
 
25
 
26
- # ----- OCR logic (image -> text) -----
27
 
28
  def build_image_prompt(width: int, height: int) -> str:
29
  """
30
- Simple document-style prompt for a single image page.
31
- You can tweak wording; this keeps it close to olmOCR's 'document' framing.
32
  """
33
  return (
34
- "You are an OCR engine. Read the document page shown in the image and "
35
- "return the plain text exactly as it appears, in natural reading order. "
36
- "Do not add extra commentary or formatting.\n"
37
  "RAW_TEXT_START\n"
38
  f"Page dimensions: {width:.1f}x{height:.1f} [Image 0x0 to {width:.1f}x{height:.1f}]\n"
39
  "RAW_TEXT_END"
40
  )
41
 
42
 
43
- def ocr_image(image: Image.Image):
44
- if image is None:
 
 
 
 
 
 
 
 
 
 
45
  return "No image uploaded."
46
 
47
- # Ensure RGB
48
- img = image.convert("RGB")
49
 
50
- # Resize to keep longest side <= 1024 for efficiency
51
- max_side = 1024
52
  w, h = img.size
53
- scale = min(max_side / max(w, h), 1.0)
54
- if scale < 1.0:
55
- img = img.resize((int(w * scale), int(h * scale)), Image.LANCZOS)
56
- w, h = img.size
57
 
58
- # Encode to base64 to match usual VLM 'image_url' style usage
59
  buf = BytesIO()
60
  img.save(buf, format="PNG")
61
- image_bytes = buf.getvalue()
62
- image_b64 = base64.b64encode(image_bytes).decode("utf-8")
63
 
64
  prompt = build_image_prompt(w, h)
65
 
@@ -76,45 +88,42 @@ def ocr_image(image: Image.Image):
76
  }
77
  ]
78
 
79
- # Apply chat template and preprocess
80
- text = processor.apply_chat_template(
81
  messages,
82
  tokenize=False,
83
  add_generation_prompt=True,
84
  )
85
 
86
  inputs = processor(
87
- text=[text],
88
  images=[img],
89
  padding=True,
90
  return_tensors="pt",
91
  )
92
- inputs = {k: v.to(device) for k, v in inputs.items()}
 
93
 
94
- with torch.no_grad():
95
- output = model.generate(
96
  **inputs,
97
- temperature=0.6,
98
  max_new_tokens=512,
99
- num_return_sequences=1,
100
- do_sample=True,
101
  )
102
 
 
103
  prompt_len = inputs["input_ids"].shape[1]
104
- new_tokens = output[:, prompt_len:]
105
- text_output = processor.tokenizer.batch_decode(
106
- new_tokens, skip_special_tokens=True
107
- )
108
-
109
- return text_output[0].strip() if text_output else "No text extracted."
110
 
 
 
111
 
112
- # ----- Gradio UI -----
113
 
114
- with gr.Blocks(title="olmOCR‑2 Image OCR") as demo:
115
  gr.Markdown(
116
- "# olmOCR‑2 Image OCR\n"
117
- "Upload an image and get extracted text using the olmOCR‑2‑7B model."
 
118
  )
119
 
120
  image_input = gr.Image(type="pil", label="Upload image")
@@ -127,5 +136,4 @@ with gr.Blocks(title="olmOCR‑2 Image OCR") as demo:
127
  api_name="/ocr",
128
  )
129
 
130
- if __name__ == "__main__":
131
- demo.queue().launch()
 
5
  from PIL import Image
6
  import gradio as gr
7
 
8
+ from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
 
 
9
 
10
# Checkpoint identifiers: olmOCR-2 weights, paired with the Qwen2-VL processor.
MODEL_NAME = "allenai/olmOCR-2-7B-1025"
PROCESSOR_NAME = "Qwen/Qwen2-VL-7B-Instruct"

# Populated on first use by load_model(); keeping them None at import time
# lets the Space boot without paying the model-load cost up front.
processor = None
model = None
17
 
 
 
18
 
19
def load_model():
    """Lazily initialize the global processor and model (idempotent).

    Safe to call on every request: after the first successful load the
    function returns immediately. fp16 weights plus device_map="auto"
    keep the 7B checkpoint within a T4's VRAM budget (per the original
    deployment notes).
    """
    global processor, model

    already_loaded = processor is not None and model is not None
    if already_loaded:
        return

    processor = AutoProcessor.from_pretrained(PROCESSOR_NAME)

    # float16 halves the weight footprint vs fp32; device_map="auto"
    # lets transformers place/offload shards instead of loading the
    # whole model onto one device at once.
    loaded = Qwen2VLForConditionalGeneration.from_pretrained(
        MODEL_NAME,
        torch_dtype=torch.float16,
        device_map="auto",
        low_cpu_mem_usage=True,
    )
    model = loaded.eval()
34
 
 
35
 
36
def build_image_prompt(width: int, height: int) -> str:
    """Return the short OCR instruction prompt for a single page image.

    The wording is deliberately terse to hold down prompt-token count and
    to discourage the model from inventing text. Width/height are embedded
    with one decimal place between the RAW_TEXT markers.
    """
    instructions = (
        "Extract all readable text from this page image.\n"
        "Return ONLY the extracted text (no explanations, no markdown).\n"
        "Do not hallucinate.\n"
    )
    dims = (
        f"Page dimensions: {width:.1f}x{height:.1f} "
        f"[Image 0x0 to {width:.1f}x{height:.1f}]\n"
    )
    return instructions + "RAW_TEXT_START\n" + dims + "RAW_TEXT_END"
49
 
50
 
51
def _resize_max_side(img: Image.Image, max_side: int = 896) -> Image.Image:
    """Downscale *img* so its longest side is at most *max_side* pixels.

    Images already within the limit are returned unchanged (same object,
    no copy). Aspect ratio is preserved. Each output dimension is clamped
    to at least 1 px: with extreme aspect ratios (e.g. 4000x1) the
    original int() truncation could produce a zero-sized dimension,
    which PIL rejects.
    """
    w, h = img.size
    longest = max(w, h)
    if longest <= max_side:
        return img
    scale = max_side / longest
    # Clamp to >= 1 so truncation never yields an empty image.
    new_size = (max(1, int(w * scale)), max(1, int(h * scale)))
    # LANCZOS is a high-quality downsampling filter; it helps keep small
    # glyphs legible, which matters for OCR input.
    return img.resize(new_size, Image.LANCZOS)
59
+
60
+
61
+ def ocr_image(img: Image.Image):
62
+ if img is None:
63
  return "No image uploaded."
64
 
65
+ load_model()
 
66
 
67
+ img = img.convert("RGB")
68
+ img = _resize_max_side(img, max_side=896)
69
  w, h = img.size
 
 
 
 
70
 
71
+ # Encode to base64 for image_url-style messages
72
  buf = BytesIO()
73
  img.save(buf, format="PNG")
74
+ image_b64 = base64.b64encode(buf.getvalue()).decode("utf-8")
 
75
 
76
  prompt = build_image_prompt(w, h)
77
 
 
88
  }
89
  ]
90
 
91
+ # Build chat text
92
+ chat_text = processor.apply_chat_template(
93
  messages,
94
  tokenize=False,
95
  add_generation_prompt=True,
96
  )
97
 
98
  inputs = processor(
99
+ text=[chat_text],
100
  images=[img],
101
  padding=True,
102
  return_tensors="pt",
103
  )
104
+ # NOTE: DO NOT .to("cuda") here when using device_map="auto"
105
+ # transformers will handle placement.
106
 
107
+ with torch.inference_mode():
108
+ output_ids = model.generate(
109
  **inputs,
 
110
  max_new_tokens=512,
111
+ do_sample=False, # OCR should be deterministic
 
112
  )
113
 
114
+ # Remove the prompt tokens to keep only the generated part
115
  prompt_len = inputs["input_ids"].shape[1]
116
+ gen_ids = output_ids[:, prompt_len:]
 
 
 
 
 
117
 
118
+ text_out = processor.tokenizer.batch_decode(gen_ids, skip_special_tokens=True)
119
+ return text_out[0].strip() if text_out else "No text extracted."
120
 
 
121
 
122
+ with gr.Blocks(title="BookReader OCR API (olmOCR2)") as demo:
123
  gr.Markdown(
124
+ "# BookReader OCR API (olmOCR2)\n"
125
+ "Upload an image get extracted text.\n\n"
126
+ "**API endpoint:** `/ocr`"
127
  )
128
 
129
  image_input = gr.Image(type="pil", label="Upload image")
 
136
  api_name="/ocr",
137
  )
138
 
139
+ demo.queue().launch()