Spaces:
Running
Running
| """Stage 1b — image OCR channel. | |
| Real OCR via Tesseract (pytesseract) or EasyOCR when installed. When neither is | |
| present, a transparent *sidecar fallback* reads `<stem>.txt` next to the document | |
| (produced by the sample generator) so the offline demo still exercises the OCR | |
| channel end-to-end. The fallback is clearly labelled `engine="sidecar-fallback"` | |
| and sets `simulated=True` so nothing is silently faked. | |
| """ | |
| from __future__ import annotations | |
| import importlib.util | |
| import os | |
| import shutil | |
| from dataclasses import dataclass | |
| from pathlib import Path | |
| from .layout import Block, ChannelResult | |
def _has(mod: str) -> bool:
    """Return True when module *mod* is importable, without importing it.

    ``importlib.util.find_spec`` does not always answer with ``None`` for an
    unusable name; it raises in two situations, both handled here so this
    probe always yields a bool:

    * ``ModuleNotFoundError`` (an ``ImportError``) for a dotted name whose
      parent package is missing or is not a package.
    * ``ValueError`` when the module is already in ``sys.modules`` but its
      ``__spec__`` is unset.

    Args:
        mod: Absolute module name, e.g. ``"easyocr"``.

    Returns:
        True if the module can be found (or is already loaded), else False.
    """
    try:
        return importlib.util.find_spec(mod) is not None
    except ImportError:
        # Parent package of a dotted name could not be imported.
        return False
    except ValueError:
        # Raised only for a module that is already loaded with no spec;
        # an `import` of it would succeed, so report it as present.
        import sys

        return sys.modules.get(mod) is not None
def tesseract_available() -> bool:
    """Report whether a ``tesseract`` executable can be found on PATH.

    Only the binary is probed: OCR is driven through the CLI, so the
    pytesseract binding is optional (it has a stderr-decode bug on some
    systems).
    """
    binary = shutil.which("tesseract")
    return binary is not None
def easyocr_available() -> bool:
    """Report whether the ``easyocr`` package can be located (not imported)."""
    found = _has("easyocr")
    return found
class OCRResult(ChannelResult):
    """Result of the OCR channel: a ``ChannelResult`` plus a ``simulated`` flag."""

    # True when the result did not come from a real OCR engine; the sidecar
    # fallback in this module passes simulated=True. Defaults to False.
    # NOTE(review): `dataclass` is imported by this module but no decorator is
    # applied to this class. If ChannelResult is a stdlib dataclass, `simulated`
    # would not be accepted as a constructor keyword — confirm against
    # ChannelResult's definition.
    simulated: bool = False
def run_ocr(images: list, source_path: str | Path) -> OCRResult:
    """Run OCR over page images, choosing the first usable engine.

    Preference order: Tesseract CLI, then EasyOCR. When there are no images,
    or neither engine is present, the sidecar text file next to
    ``source_path`` is used instead.
    """
    # Nothing to recognise: only the sidecar can provide text.
    if not images:
        return _ocr_sidecar(source_path)

    if tesseract_available():
        return _ocr_tesseract(images)

    if easyocr_available():
        return _ocr_easyocr(images)

    return _ocr_sidecar(source_path)
def _ocr_tesseract(images: list) -> OCRResult:
    """OCR each page image by piping it to the tesseract CLI.

    Every image is PNG-encoded in memory and sent over stdin
    (``tesseract stdin stdout``), so no temp files are written. Pages are
    handled independently and best-effort: a page whose invocation fails or
    times out contributes empty text. Failures and non-zero exit codes are
    logged so a broken Tesseract setup is distinguishable from a blank page.

    Args:
        images: Page images exposing a PIL-style ``save(fp, format=...)``.

    Returns:
        An ``OCRResult`` with ``engine="tesseract"``, the page texts joined by
        newlines, one ``Block`` per whitespace-separated word (fixed
        confidence 0.8), ``pages`` equal to the number of images, and
        ``available`` true only when some non-whitespace text was produced.
    """
    import io
    import logging
    import subprocess

    log = logging.getLogger(__name__)
    exe = shutil.which("tesseract")
    parts, blocks = [], []
    for pno, img in enumerate(images):
        buf = io.BytesIO()
        img.save(buf, format="PNG")
        page_text = ""
        try:
            out = subprocess.run(
                [exe, "stdin", "stdout"],
                input=buf.getvalue(),
                capture_output=True,
                timeout=120,
            )
        except Exception:
            # Deliberately broad: one bad page (timeout, spawn failure, or a
            # missing executable when `exe` is None) must not abort the
            # document. The page stays empty, but the cause is recorded.
            log.warning("tesseract invocation failed on page %d", pno,
                        exc_info=True)
        else:
            if out.returncode != 0:
                log.warning(
                    "tesseract exited with status %d on page %d: %s",
                    out.returncode, pno,
                    out.stderr.decode("utf-8", errors="replace").strip(),
                )
            # Whatever reached stdout is still used, as before.
            page_text = out.stdout.decode("utf-8", errors="ignore")
        parts.append(page_text)
        for w in page_text.split():
            blocks.append(Block(text=w, page=pno, source="ocr", confidence=0.8))
    text = "\n".join(parts)
    return OCRResult(text=text, blocks=blocks, pages=len(images),
                     available=bool(text.strip()), engine="tesseract")
def _ocr_easyocr(images: list) -> OCRResult:
    """OCR page images with an English, CPU-only EasyOCR reader.

    Each detection becomes one ``Block`` whose bbox is the axis-aligned
    rectangle enclosing the detection's corner points and whose confidence is
    the score reported by EasyOCR. A page's text is its detections joined by
    single spaces; pages are joined by newlines.
    """
    import easyocr
    import numpy as np

    reader = easyocr.Reader(["en"], gpu=False, verbose=False)
    page_texts = []
    blocks = []
    for page_no, image in enumerate(images):
        detections = reader.readtext(np.array(image))
        fragments = []
        for corners, fragment, score in detections:
            fragments.append(fragment)
            # Collapse the corner points into (x0, y0, x1, y1).
            x_coords = [corner[0] for corner in corners]
            y_coords = [corner[1] for corner in corners]
            enclosing = (min(x_coords), min(y_coords),
                         max(x_coords), max(y_coords))
            block = Block(text=fragment, page=page_no, bbox=enclosing,
                          source="ocr", confidence=float(score))
            blocks.append(block)
        page_texts.append(" ".join(fragments))

    text = "\n".join(page_texts)
    recognised = bool(text.strip())
    return OCRResult(text=text, blocks=blocks, pages=len(images),
                     available=recognised, engine="easyocr")
def _ocr_sidecar(source_path: str | Path) -> OCRResult:
    """Fallback channel: load the text sidecar stored beside the document.

    Looks for `<stem>.txt` first, then `<stem>.ocr.txt`. The result is always
    flagged ``simulated=True`` so consumers can tell it apart from real OCR
    (e.g. to show 'install Tesseract for real OCR'). When no sidecar exists,
    an unavailable result with ``engine="unavailable"`` is returned.
    """
    document = Path(source_path)
    chosen = document.with_suffix(".txt")
    if not chosen.exists():
        # Second naming convention: `<stem>.ocr.txt`.
        fallback = document.with_name(document.stem + ".ocr.txt")
        if fallback.exists():
            chosen = fallback

    if not chosen.exists():
        return OCRResult(available=False, engine="unavailable", simulated=True)

    text = chosen.read_text(encoding="utf-8", errors="ignore")
    blocks = []
    for word in text.split():
        # The whole sidecar is treated as a single page (page 0).
        blocks.append(Block(text=word, page=0, source="ocr", confidence=0.85))
    has_text = bool(text.strip())
    return OCRResult(text=text, blocks=blocks, pages=1,
                     available=has_text, engine="sidecar-fallback",
                     simulated=True)