ERP-DocIQ

Running

App Files Files Community

ERP-DocIQ / backend /app /ocr /ocr_engine.py

kenmandal

Deploy ERP-DocIQ: agentic OCR + IDP (MiniCPM-V 8B, Tesseract)

32b00ed verified 27 days ago

Raw

History Blame Contribute Delete

4.18 kB

	"""Stage 1b — image OCR channel.

	Real OCR via the Tesseract CLI or EasyOCR when installed. When neither is
	present, a transparent sidecar fallback reads `<stem>.txt` next to the document
	(produced by the sample generator) so the offline demo still exercises the OCR
	channel end-to-end. The fallback is clearly labelled `engine="sidecar-fallback"`
	and sets `simulated=True` so nothing is silently faked.
	"""
	from __future__ import annotations

	import importlib.util
	import os
	import shutil
	from dataclasses import dataclass
	from pathlib import Path

	from .layout import Block, ChannelResult


	def _has(mod: str) -> bool:
	return importlib.util.find_spec(mod) is not None


	def tesseract_available() -> bool:
	    """Report whether a `tesseract` executable can be found on PATH.

	    Only the binary is checked: OCR is driven through the CLI (the
	    pytesseract Python binding has a stderr-decode bug on some systems),
	    so pytesseract itself is optional.
	    """
	    binary = shutil.which("tesseract")
	    return binary is not None


	def easyocr_available() -> bool:
	    """Report whether the `easyocr` package can be located (via `_has`)."""
	    found = _has("easyocr")
	    return found


	@dataclass
	class OCRResult(ChannelResult):
	    """Result of the OCR channel: a `ChannelResult` plus a `simulated` flag."""

	    # True when the text did not come from a real OCR engine. In this module
	    # the sidecar fallback and the "unavailable" result set it; the Tesseract
	    # and EasyOCR paths leave the default.
	    simulated: bool = False


	def run_ocr(images: list, source_path: str | Path) -> OCRResult:
	    """OCR a list of PIL page images, falling back to the sidecar text file.

	    Engines are tried in a fixed order: the Tesseract CLI first, then
	    EasyOCR. The sidecar fallback is used when there are no images or when
	    neither engine is installed.

	    Note: the choice is made on engine *availability* only. If the selected
	    engine runs but recognises no text, its (empty) result is returned as-is;
	    there is no further fall-through to the next engine or to the sidecar.

	    Args:
	        images: Page images to recognise. An empty (or otherwise falsy) value
	            skips the real engines entirely.
	        source_path: Path of the source document; only used to locate the
	            sidecar text file.

	    Returns:
	        The `OCRResult` produced by the first applicable channel.
	    """
	    if images:
	        if tesseract_available():
	            return _ocr_tesseract(images)
	        if easyocr_available():
	            return _ocr_easyocr(images)
	    return _ocr_sidecar(source_path)


	def _ocr_tesseract(images: list) -> OCRResult:
	    """Recognise page images by piping PNG bytes to the tesseract CLI.

	    Each page is encoded to PNG in memory and sent to ``tesseract stdin
	    stdout``, so no temporary files are written. OCR is best-effort per page:
	    a page that cannot be encoded or recognised contributes an empty string
	    (and a logged warning) instead of aborting the whole document.

	    Args:
	        images: PIL page images (anything exposing ``save``/``convert``).

	    Returns:
	        An `OCRResult` with ``engine="tesseract"``. ``text`` is the per-page
	        output joined with newlines, ``blocks`` holds one `Block` per
	        whitespace-separated token (fixed confidence 0.8, since the plain-text
	        CLI output carries no per-word score), and ``available`` is True only
	        when some non-whitespace text was recognised.
	    """
	    import io
	    import logging
	    import subprocess

	    log = logging.getLogger(__name__)
	    exe = shutil.which("tesseract")
	    if exe is None:
	        log.warning("tesseract executable not found on PATH; no pages were OCRed")

	    parts, blocks = [], []
	    for pno, img in enumerate(images):
	        page_text = ""
	        if exe is not None:
	            try:
	                buf = io.BytesIO()
	                try:
	                    img.save(buf, format="PNG")
	                except (OSError, ValueError):
	                    # PNG cannot encode every PIL mode (e.g. CMYK); retry
	                    # with an RGB copy rather than losing the page.
	                    buf = io.BytesIO()
	                    img.convert("RGB").save(buf, format="PNG")
	                out = subprocess.run(
	                    [exe, "stdin", "stdout"],
	                    input=buf.getvalue(),
	                    capture_output=True,
	                    timeout=120,
	                )
	                if out.returncode != 0:
	                    log.warning(
	                        "tesseract exited with code %d on page %d: %s",
	                        out.returncode,
	                        pno,
	                        out.stderr.decode("utf-8", errors="replace").strip()[:500],
	                    )
	                # Whatever reached stdout is still used, as before.
	                page_text = out.stdout.decode("utf-8", errors="ignore")
	            except Exception:
	                # Deliberately broad: one bad page must not fail the document.
	                log.warning("tesseract OCR failed on page %d", pno, exc_info=True)
	                page_text = ""
	        parts.append(page_text)
	        blocks.extend(
	            Block(text=w, page=pno, source="ocr", confidence=0.8)
	            for w in page_text.split()
	        )

	    text = "\n".join(parts)
	    return OCRResult(text=text, blocks=blocks, pages=len(images),
	                     available=bool(text.strip()), engine="tesseract")


	def _ocr_easyocr(images: list) -> OCRResult:
	    """Recognise page images with EasyOCR (English, CPU only).

	    Args:
	        images: PIL page images. Objects without a ``convert`` method are
	            passed to NumPy unchanged.

	    Returns:
	        An `OCRResult` with ``engine="easyocr"``. Each detection becomes one
	        `Block` carrying its axis-aligned bounding box and EasyOCR's
	        confidence; ``text`` joins a page's detections with spaces and pages
	        with newlines. ``available`` is True only when some non-whitespace
	        text was recognised.
	    """
	    import easyocr
	    import numpy as np

	    # NOTE(review): a Reader is constructed on every call; presumably this
	    # reloads the recognition models each time — consider caching if this
	    # shows up in profiles.
	    reader = easyocr.Reader(["en"], gpu=False, verbose=False)
	    parts, blocks = [], []
	    for pno, img in enumerate(images):
	        # Normalise to 8-bit RGB first: bilevel ("1") and palette ("P") images
	        # would otherwise become boolean / palette-index arrays, which are not
	        # meaningful pixel data for the recogniser.
	        rgb = img.convert("RGB") if hasattr(img, "convert") else img
	        words = []
	        for bbox, word, conf in reader.readtext(np.array(rgb)):
	            words.append(word)
	            # bbox is a sequence of (x, y) corner points; collapse it to
	            # (x_min, y_min, x_max, y_max).
	            xs = [pt[0] for pt in bbox]
	            ys = [pt[1] for pt in bbox]
	            blocks.append(
	                Block(text=word, page=pno,
	                      bbox=(min(xs), min(ys), max(xs), max(ys)),
	                      source="ocr", confidence=float(conf))
	            )
	        parts.append(" ".join(words))

	    text = "\n".join(parts)
	    return OCRResult(text=text, blocks=blocks, pages=len(images),
	                     available=bool(text.strip()), engine="easyocr")


	def _ocr_sidecar(source_path: str | Path) -> OCRResult:
	    """Fallback: read the sidecar text file (the text the page would OCR to).

	    Looks for ``<stem>.txt`` next to the document first, then
	    ``<stem>.ocr.txt``. This keeps the demo working with zero OCR deps. The
	    result is explicitly labelled as simulated so the dashboard can show
	    'install Tesseract for real OCR'.

	    As the last-resort channel this function does not raise for a missing,
	    unreadable or non-file sidecar; it reports "unavailable" instead.

	    Args:
	        source_path: Path of the source document.

	    Returns:
	        An `OCRResult` with ``engine="sidecar-fallback"`` built from the
	        sidecar text (one `Block` per whitespace-separated token, page 0,
	        confidence 0.85), or one with ``engine="unavailable"`` and
	        ``available=False`` when no sidecar can be read. ``simulated`` is
	        True in both cases.
	    """
	    unavailable = OCRResult(available=False, engine="unavailable", simulated=True)

	    p = Path(source_path)
	    if not p.name:
	        # with_suffix()/with_name() raise ValueError on a path with no final
	        # component (e.g. "" or "/"); there is nothing to look up anyway.
	        return unavailable

	    candidates = (p.with_suffix(".txt"), p.with_name(p.stem + ".ocr.txt"))
	    # is_file() rather than exists(): a directory with a matching name must
	    # not be handed to read_text().
	    sidecar = next((c for c in candidates if c.is_file()), None)
	    if sidecar is None:
	        return unavailable

	    try:
	        text = sidecar.read_text(encoding="utf-8", errors="ignore")
	    except OSError:
	        return unavailable

	    blocks = [Block(text=w, page=0, source="ocr", confidence=0.85)
	              for w in text.split()]
	    return OCRResult(text=text, blocks=blocks, pages=1,
	                     available=bool(text.strip()), engine="sidecar-fallback",
	                     simulated=True)