Didier committed on
Commit
58ba391
·
verified ·
1 Parent(s): d069605

Upload vlm_ocr.py

Browse files
Files changed (1) hide show
  1. vlm_ocr.py +162 -0
vlm_ocr.py ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ File: vlm_ocr.py
3
+
4
+ This module provides a VLM OCR model for Docling.
5
+
6
+ :author: Didier Guillevic
7
+ :email: didier.guillevic@gmail.com
8
+ :date: 2026-02-27
9
+ :license: Apache License 2.0
10
+ """
11
+ import base64
12
+ import io
13
+ import logging
14
+ import requests
15
+ import itertools
16
+ from collections.abc import Iterable
17
+ from pathlib import Path
18
+ from typing import Any, ClassVar, List, Literal, Optional, Type
19
+
20
+ from docling.datamodel.accelerator_options import AcceleratorOptions
21
+ from docling.datamodel.base_models import Page
22
+ from docling.datamodel.document import ConversionResult
23
+ from docling.datamodel.pipeline_options import OcrOptions
24
+ from docling.models.base_ocr_model import BaseOcrModel
25
+ from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
26
+ from docling_core.types.doc.page import BoundingRectangle, TextCell
27
+ from PIL import Image
28
+
29
# Module-level logger for this OCR integration.
_log = logging.getLogger(__name__)

# Cooperative cancellation flag: set by request_cancel(), cleared by
# reset_cancel(), and checked once per page inside VlmOcrModel.__call__.
_cancel_requested = False
31
+
32
def request_cancel():
    """Ask any running OCR loop to stop.

    Sets the module-level ``_cancel_requested`` flag; the OCR model polls it
    once per page and passes remaining pages through untouched.
    """
    global _cancel_requested
    _cancel_requested = True
35
+
36
def reset_cancel():
    """Clear a previous cancellation request.

    Resets the module-level ``_cancel_requested`` flag so a subsequent OCR
    run is not skipped by a stale cancel.
    """
    global _cancel_requested
    _cancel_requested = False
39
+
40
class VlmOcrOptions(OcrOptions):
    """Options for the OpenAI-compatible VLM OCR backend.

    Configures which chat-completions endpoint and model the OCR model
    sends page-region images to, plus the transcription prompt and timeout.
    """

    # Discriminator used by the pipeline to select this OCR backend.
    kind: ClassVar[Literal["vlm_ocr"]] = "vlm_ocr"
    # Declared languages (required by the OcrOptions interface; the VLM
    # itself is not restricted by this setting as far as this module shows).
    lang: List[str] = ["en"]
    # Model name placed in the chat-completions request payload.
    model: str = "Ministral-3-14B-Instruct-2512"
    # Base URL of the OpenAI-compatible server (e.g. a local llama.cpp/vLLM).
    openai_base_url: str = "http://localhost:8080/v1"
    # Bearer token sent in the Authorization header; placeholder default
    # for local servers that ignore authentication.
    openai_api_key: str = "Keep learning"
    # Instruction sent alongside each region image.
    prompt: str = "Transcribe the text in this image. Return only the transcription. Use standard Markdown table syntax for any tables found. Be extremely accurate."
    # Per-request timeout in seconds for the HTTP call.
    timeout: float = 300.0
48
+
49
class VlmOcrModel(BaseOcrModel):
    """OCR model that transcribes page regions via an OpenAI-compatible VLM.

    Each detected OCR region is rendered to a high-resolution PNG,
    base64-encoded, and sent to a chat-completions endpoint; the returned
    transcription is attached to the page as a single TextCell covering
    the region. Failures are logged per region and skipped (best effort).
    """

    # Render scale for region crops; higher resolution helps transcription.
    _IMAGE_SCALE = 3.0

    def __init__(
        self,
        enabled: bool,
        artifacts_path: Optional[Path],
        options: VlmOcrOptions,
        accelerator_options: AcceleratorOptions,
    ):
        """Initialize the model.

        :param enabled: when False, ``__call__`` passes pages through untouched.
        :param artifacts_path: forwarded to the base class (unused locally).
        :param options: endpoint, model, prompt and timeout configuration.
        :param accelerator_options: forwarded to the base class (unused locally).
        """
        super().__init__(
            enabled=enabled,
            artifacts_path=artifacts_path,
            options=options,
            accelerator_options=accelerator_options,
        )
        # Narrow the attribute type so the helpers below get VLM-specific fields.
        self.options: VlmOcrOptions = options

    @staticmethod
    def _encode_png_base64(image: Image.Image) -> str:
        """Return *image* serialized as a base64-encoded PNG string."""
        buffered = io.BytesIO()
        image.save(buffered, format="PNG")
        return base64.b64encode(buffered.getvalue()).decode("utf-8")

    def _transcribe_image(self, img_str: str) -> str:
        """Send one base64 PNG to the chat-completions endpoint, return the text.

        :param img_str: base64-encoded PNG bytes (no data-URL prefix).
        :raises requests.RequestException: on connection/timeout/HTTP errors.
        :raises KeyError, IndexError: if the response is not OpenAI-shaped.
        """
        payload = {
            "model": self.options.model,
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": self.options.prompt},
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/png;base64,{img_str}"},
                        },
                    ],
                }
            ],
            # Deterministic decoding is preferable for transcription.
            "temperature": 0.0,
        }
        headers = {"Authorization": f"Bearer {self.options.openai_api_key}"}
        endpoint = f"{self.options.openai_base_url.rstrip('/')}/chat/completions"
        response = requests.post(
            endpoint,
            json=payload,
            headers=headers,
            timeout=self.options.timeout,
        )
        response.raise_for_status()
        result = response.json()
        return result["choices"][0]["message"]["content"]

    def __call__(
        self, conv_res: ConversionResult, page_batch: Iterable[Page]
    ) -> Iterable[Page]:
        """Yield each page with OCR cells added for its detected OCR regions.

        Pages pass through unchanged when the model is disabled, when a
        cancel was requested (module-level flag), or when the page backend
        is missing/invalid. A failed region is logged and skipped so one
        bad request never aborts the page or the batch.
        """
        if not self.enabled:
            yield from page_batch
            return

        for page in page_batch:
            if _cancel_requested:
                _log.info("OCR execution cancelled.")
                yield page
                continue

            if page._backend is None or not page._backend.is_valid():
                yield page
                continue

            # Identify OCR regions on this page.
            ocr_rects = self.get_ocr_rects(page)
            all_ocr_cells = []

            for i, ocr_rect in enumerate(ocr_rects):
                # Degenerate rectangles have nothing to transcribe.
                if ocr_rect.area() == 0:
                    continue

                # Render the region at high resolution and encode it.
                high_res_image = page._backend.get_page_image(
                    scale=self._IMAGE_SCALE, cropbox=ocr_rect
                )
                img_str = self._encode_png_base64(high_res_image)

                try:
                    # Lazy %-args keep formatting cost off when INFO is disabled.
                    _log.info(
                        "Sending VLM OCR request for page %s, region %s",
                        page.page_no,
                        i,
                    )
                    transcription = self._transcribe_image(img_str)
                    all_ocr_cells.append(
                        TextCell(
                            index=len(all_ocr_cells),
                            text=transcription,
                            orig=transcription,
                            from_ocr=True,
                            # The API returns no per-cell score; assume full confidence.
                            confidence=1.0,
                            rect=BoundingRectangle.from_bounding_box(ocr_rect),
                        )
                    )
                except Exception as e:
                    # Best effort: log and move on to the next region/page.
                    _log.error("VLM OCR failed for page %s: %s", page.page_no, e)

            # Post-process the cells (dedup/merge handled by the base class).
            self.post_process_cells(all_ocr_cells, page)
            yield page

    @classmethod
    def get_options_type(cls) -> Type[OcrOptions]:
        """Return the options class this model consumes (pipeline dispatch key)."""
        return VlmOcrOptions
152
+
153
class LocalVlmPdfPipeline(StandardPdfPipeline):
    """Standard PDF pipeline that routes ``vlm_ocr`` options to VlmOcrModel."""

    def _make_ocr_model(self, art_path: Path | None) -> Any:
        """Build the OCR model; defer to the parent for non-VLM options."""
        ocr_opts = self.pipeline_options.ocr_options
        if not isinstance(ocr_opts, VlmOcrOptions):
            return super()._make_ocr_model(art_path)
        return VlmOcrModel(
            enabled=self.pipeline_options.do_ocr,
            artifacts_path=art_path,
            options=ocr_opts,
            accelerator_options=self.pipeline_options.accelerator_options,
        )