ERP-DocIQ / backend /app /ocr /ocr_engine.py
kenmandal's picture
Deploy ERP-DocIQ: agentic OCR + IDP (MiniCPM-V 8B, Tesseract)
32b00ed verified
Raw
History Blame Contribute Delete
4.18 kB
"""Stage 1b — image OCR channel.
Real OCR via Tesseract (the `tesseract` CLI binary) or EasyOCR when installed. When neither is
present, a transparent *sidecar fallback* reads `<stem>.txt` next to the document
(produced by the sample generator) so the offline demo still exercises the OCR
channel end-to-end. The fallback is clearly labelled `engine="sidecar-fallback"`
and sets `simulated=True` so nothing is silently faked.
"""
from __future__ import annotations
import importlib.util
import os
import shutil
from dataclasses import dataclass
from pathlib import Path
from .layout import Block, ChannelResult
def _has(mod: str) -> bool:
    """Return True if module *mod* can be located, without importing *mod* itself.

    This is a pure availability probe, so it never raises for a module that
    simply is not there: any lookup failure is reported as ``False``.
    """
    try:
        return importlib.util.find_spec(mod) is not None
    except (ImportError, ValueError):
        # find_spec raises ModuleNotFoundError (an ImportError) when the parent
        # package of a dotted name is missing, and ValueError when the module
        # has no usable spec. Either way the module is not usable.
        return False
def tesseract_available() -> bool:
    """Report whether a ``tesseract`` executable can be found on ``PATH``."""
    # Only the binary matters: OCR is driven through the CLI (the pytesseract
    # Python binding has a stderr-decode bug on some systems), so the
    # pytesseract package itself is optional.
    binary_path = shutil.which("tesseract")
    return binary_path is not None
def easyocr_available() -> bool:
    """Report whether the ``easyocr`` package can be located by the import system."""
    found = _has("easyocr")
    return found
@dataclass
class OCRResult(ChannelResult):
    """Result of the image OCR channel: a ``ChannelResult`` plus a ``simulated`` flag."""

    # Set to True by the sidecar fallback path (text read from a file, or no
    # text at all); the Tesseract and EasyOCR paths leave it at False.
    simulated: bool = False
def run_ocr(images: list, source_path: str | Path) -> OCRResult:
    """OCR a list of PIL page images, preferring a real engine over the sidecar.

    Engine order is Tesseract, then EasyOCR. The sidecar text file next to
    *source_path* is used when *images* is empty or when neither engine is
    installed.
    """
    # No page images at all: nothing for a real engine to read.
    if not images:
        return _ocr_sidecar(source_path)
    if tesseract_available():
        return _ocr_tesseract(images)
    if easyocr_available():
        return _ocr_easyocr(images)
    return _ocr_sidecar(source_path)
def _ocr_tesseract(images: list) -> OCRResult:
    """Drive the tesseract CLI via stdin (robust + sandbox-safe — no temp files).

    Each page image is PNG-encoded in memory and piped to
    ``tesseract stdin stdout``; the captured stdout is that page's text.
    OCR is best-effort per page: any failure (image cannot be encoded, binary
    missing, timeout, ...) is logged and yields an empty string for that page
    rather than aborting the whole document.

    Args:
        images: Page images exposing ``save(fileobj, format="PNG")``.

    Returns:
        An ``OCRResult`` with ``engine="tesseract"``. Page texts are joined
        with newlines, and each whitespace-separated word becomes one ``Block``
        with a fixed confidence of 0.8. ``available`` is False when no
        non-whitespace text was recognised.
    """
    import io
    import logging
    import subprocess

    log = logging.getLogger(__name__)
    exe = shutil.which("tesseract")
    parts, blocks = [], []
    for pno, img in enumerate(images):
        page_text = ""
        try:
            # Encoding happens inside the try so that a page which cannot be
            # written as PNG degrades to an empty page like any other failure.
            buf = io.BytesIO()
            img.save(buf, format="PNG")
            out = subprocess.run([exe, "stdin", "stdout"], input=buf.getvalue(),
                                 capture_output=True, timeout=120)
            if out.returncode != 0:
                log.warning("tesseract exited with code %d on page %d: %s",
                            out.returncode, pno,
                            out.stderr.decode("utf-8", errors="ignore").strip())
            page_text = out.stdout.decode("utf-8", errors="ignore")
        except Exception:
            # Deliberately broad: one bad page must not sink the document, but
            # the failure is recorded instead of being silently discarded.
            log.warning("tesseract OCR failed on page %d", pno, exc_info=True)
            page_text = ""
        parts.append(page_text)
        blocks.extend(
            Block(text=w, page=pno, source="ocr", confidence=0.8)
            for w in page_text.split()
        )
    text = "\n".join(parts)
    return OCRResult(text=text, blocks=blocks, pages=len(images),
                     available=bool(text.strip()), engine="tesseract")
def _ocr_easyocr(images: list) -> OCRResult:
    """Run EasyOCR (English, CPU-only) over each page image.

    Every detection returned by ``readtext`` becomes one ``Block`` whose bbox
    is the axis-aligned rectangle enclosing the detection's corner points and
    whose confidence is the score reported by the engine. A page's text is its
    detections joined by single spaces; pages are joined by newlines.
    """
    import easyocr
    import numpy as np

    ocr_reader = easyocr.Reader(["en"], gpu=False, verbose=False)
    page_texts = []
    ocr_blocks = []
    for page_index, page_image in enumerate(images):
        fragments = []
        for corners, fragment, score in ocr_reader.readtext(np.array(page_image)):
            fragments.append(fragment)
            x_coords = [corner[0] for corner in corners]
            y_coords = [corner[1] for corner in corners]
            enclosing_box = (min(x_coords), min(y_coords),
                             max(x_coords), max(y_coords))
            ocr_blocks.append(
                Block(text=fragment, page=page_index, bbox=enclosing_box,
                      source="ocr", confidence=float(score))
            )
        page_texts.append(" ".join(fragments))
    full_text = "\n".join(page_texts)
    has_text = bool(full_text.strip())
    return OCRResult(text=full_text, blocks=ocr_blocks, pages=len(images),
                     available=has_text, engine="easyocr")
def _ocr_sidecar(source_path: str | Path) -> OCRResult:
    """Fallback: read the `<stem>.txt` sidecar (the text the page would OCR to).

    This keeps the demo working with zero OCR deps. It is explicitly labelled as
    simulated so the dashboard can show 'install Tesseract for real OCR'.

    Lookup order is ``<stem>.txt`` and then ``<stem>.ocr.txt``, both next to
    *source_path*. Only regular files count, so a directory that happens to
    carry a sidecar name is skipped instead of failing on read.

    Args:
        source_path: Path of the document whose sidecar text should be read.

    Returns:
        An ``OCRResult`` with ``simulated=True``. When a sidecar is found,
        ``engine="sidecar-fallback"``, the whole file is treated as one page
        and each whitespace-separated word becomes a ``Block`` on page 0 with
        confidence 0.85. Otherwise ``engine="unavailable"`` and
        ``available=False``.
    """
    p = Path(source_path)
    if not p.name:
        # Paths such as "", "." or "/" have no file name; with_suffix() and
        # with_name() raise ValueError for them, and no sidecar can exist.
        return OCRResult(available=False, engine="unavailable", simulated=True)

    candidates = (p.with_suffix(".txt"), p.with_name(p.stem + ".ocr.txt"))
    sidecar = next((c for c in candidates if c.is_file()), None)
    if sidecar is None:
        return OCRResult(available=False, engine="unavailable", simulated=True)

    text = sidecar.read_text(encoding="utf-8", errors="ignore")
    blocks = [Block(text=w, page=0, source="ocr", confidence=0.85)
              for w in text.split()]
    return OCRResult(text=text, blocks=blocks, pages=1,
                     available=bool(text.strip()), engine="sidecar-fallback",
                     simulated=True)