"""OCR pipeline: run EasyOCR or PaddleOCR on images/PDFs and save JSON + CSV."""
from __future__ import annotations

import json
from pathlib import Path
from typing import Iterable, List, Tuple

import click
import numpy as np
from PIL import Image

from .models import get_reader, get_paddle_reader
from .utils import preprocess, quad_to_bbox
from .schema import OCRBlock
from .pdf import pdf_to_images

# Supported extensions
IMAGE_EXTS = {".png", ".jpg", ".jpeg", ".bmp", ".webp", ".tif", ".tiff", ".gif"}
PDF_EXTS = {".pdf"}

# Column order of the CSV export; declared once so that an empty result set
# still produces a file with a header row.
_CSV_COLUMNS = ("page", "x1", "y1", "x2", "y2", "text", "confidence")


# ---------------- EasyOCR (base) ----------------
def run_ocr_on_image(
    img: Image.Image,
    langs: Iterable[str] = ("en",),
    conf_threshold: float = 0.3,
    page: int = 1,
) -> List[OCRBlock]:
    """Run EasyOCR on a single PIL image and return structured blocks.

    Args:
        img: Page image; it is passed through ``preprocess`` before recognition.
        langs: Language codes handed to ``get_reader`` (as a tuple).
        conf_threshold: Detections with a confidence below this are dropped.
        page: Page number recorded on every returned block.

    Returns:
        One ``OCRBlock`` per kept detection, in the order the reader
        returned them.
    """
    reader = get_reader(tuple(langs))
    img_prep = preprocess(img)
    results = reader.readtext(np.array(img_prep), detail=1, paragraph=False)  # [quad, text, conf]

    blocks: List[OCRBlock] = []
    for quad, text, conf in results:
        # Skip detections without a usable confidence or with blank text.
        if conf is None or conf < conf_threshold or not str(text).strip():
            continue
        bbox = quad_to_bbox(quad)
        blocks.append(OCRBlock(page=page, bbox=bbox, text=str(text), confidence=float(conf)))
    return blocks


# ---------------- PaddleOCR (high quality) ----------------
def run_ocr_on_image_paddle(
    img: Image.Image,
    lang: str = "en",
    conf_threshold: float = 0.3,
    page: int = 1,
) -> List[OCRBlock]:
    """Run PaddleOCR (det + rec) on a PIL image and return OCRBlocks.

    Args:
        img: Page image; converted to RGB and then to a BGR array for the
            reader.
        lang: Language code handed to ``get_paddle_reader``.
        conf_threshold: Detections with a confidence below this are dropped.
        page: Page number recorded on every returned block.

    Returns:
        One ``OCRBlock`` per kept detection. Each bbox is the axis-aligned
        ``(min_x, min_y, max_x, max_y)`` of the detected quad, with the
        coordinates truncated to ints.
    """
    import cv2  # local import: only this backend needs OpenCV

    ocr = get_paddle_reader(lang)
    arr = cv2.cvtColor(np.array(img.convert("RGB")), cv2.COLOR_RGB2BGR)
    result = ocr.ocr(arr, cls=True)

    blocks: List[OCRBlock] = []
    # Guard against an empty result: depending on the PaddleOCR release, a
    # page without detections may come back as ``None`` or ``[None]``, and
    # iterating that directly would raise TypeError.
    for line in result or []:
        if not line:
            continue
        for det in line:
            quad, (text, conf) = det
            if conf is None or conf < conf_threshold or not str(text).strip():
                continue
            xs = [int(p[0]) for p in quad]
            ys = [int(p[1]) for p in quad]
            bbox = (min(xs), min(ys), max(xs), max(ys))
            blocks.append(OCRBlock(page=page, bbox=bbox, text=str(text), confidence=float(conf)))
    return blocks


# ---------------- File routing ----------------
def render_input_to_pages(path: Path, dpi: int = 200) -> List[Image.Image]:
    """Convert a file (image or PDF) into a list of PIL pages.

    Args:
        path: Input file; the type is chosen from its lower-cased suffix.
        dpi: Render resolution forwarded to ``pdf_to_images`` for PDFs.

    Returns:
        The pages from ``pdf_to_images`` for a PDF, or a one-element list
        holding the RGB-converted image for an image file.

    Raises:
        ValueError: If the suffix is in neither ``PDF_EXTS`` nor ``IMAGE_EXTS``.
    """
    suffix = path.suffix.lower()
    if suffix in PDF_EXTS:
        return pdf_to_images(path, dpi=dpi)
    if suffix in IMAGE_EXTS:
        # convert() returns an independent in-memory image, so the source
        # file handle can be closed as soon as the conversion is done.
        # NOTE(review): only the current (first) frame is converted; extra
        # frames of multi-frame GIF/TIFF files are not read - confirm this
        # is intended.
        with Image.open(path) as src:
            return [src.convert("RGB")]
    raise ValueError(f"Unsupported file type: {path.suffix}")


def ocr_file(
    input_path: Path,
    langs: Iterable[str] = ("en",),
    dpi: int = 200,
    conf_threshold: float = 0.3,
) -> List[OCRBlock]:
    """Main OCR entrypoint for one file using EasyOCR (default).

    Args:
        input_path: Image or PDF to process.
        langs: Language codes forwarded to ``run_ocr_on_image``.
        dpi: PDF render resolution forwarded to ``render_input_to_pages``.
        conf_threshold: Minimum confidence for a block to be kept.

    Returns:
        The blocks of all pages concatenated in page order; pages are
        numbered starting at 1.

    Raises:
        ValueError: Propagated from ``render_input_to_pages`` for an
            unsupported file type.
    """
    pages = render_input_to_pages(input_path, dpi=dpi)
    all_blocks: List[OCRBlock] = []
    for page_no, page_img in enumerate(pages, start=1):
        all_blocks.extend(
            run_ocr_on_image(page_img, langs=langs, conf_threshold=conf_threshold, page=page_no)
        )
    return all_blocks


# ---------------- Save helpers ----------------
def save_json(blocks: List[OCRBlock], out_path: Path) -> None:
    """Save OCR results to JSON.

    Creates the parent directory if needed and writes the ``model_dump()``
    of every block as a UTF-8, 2-space indented JSON array.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    data = [b.model_dump() for b in blocks]
    out_path.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")


def save_csv(blocks: List[OCRBlock], out_path: Path) -> None:
    """Save OCR results to CSV (via pandas).

    Creates the parent directory if needed and writes one row per block
    with the columns ``page, x1, y1, x2, y2, text, confidence``. The header
    row is written even when ``blocks`` is empty.
    """
    import pandas as pd  # local import: pandas is only needed for CSV export

    out_path.parent.mkdir(parents=True, exist_ok=True)
    rows = []
    for b in blocks:
        x1, y1, x2, y2 = b.bbox
        rows.append(
            {
                "page": b.page,
                "x1": x1,
                "y1": y1,
                "x2": x2,
                "y2": y2,
                "text": b.text,
                "confidence": b.confidence,
            }
        )
    # Explicit columns keep the header present for an empty result set.
    pd.DataFrame(rows, columns=list(_CSV_COLUMNS)).to_csv(out_path, index=False)


def _unique_stem(stem: str, used: set[str]) -> str:
    """Return ``stem``, or ``stem_2``, ``stem_3``, ... if it is already taken.

    ``used`` holds the case-folded names handed out so far and is updated in
    place. Case-folding avoids clashes on case-insensitive filesystems.
    """
    candidate = stem
    counter = 2
    while candidate.casefold() in used:
        candidate = f"{stem}_{counter}"
        counter += 1
    used.add(candidate.casefold())
    return candidate


# ---------------- CLI ----------------
@click.command(context_settings=dict(help_option_names=["-h", "--help"]))
@click.argument("input_path", type=click.Path(exists=True, path_type=Path))
@click.argument("output_dir", type=click.Path(path_type=Path))
@click.option("--lang", "langs", multiple=True, default=["en"], show_default=True, help="Languages for EasyOCR (e.g., en, fr, de)")
@click.option("--dpi", default=200, show_default=True, help="PDF render DPI")
@click.option("--conf-threshold", default=0.3, show_default=True, help="Min confidence to keep a block")
def main(input_path: Path, output_dir: Path, langs: list[str], dpi: int, conf_threshold: float):
    """Run OCR on a file or a folder recursively, save JSON + CSV results (EasyOCR)."""
    inputs: list[Path] = []
    if input_path.is_dir():
        supported = IMAGE_EXTS | PDF_EXTS
        # rglob("*") also yields directories, so keep regular files only;
        # sorting makes the processing order (and any collision suffixes)
        # deterministic.
        inputs = sorted(
            p for p in input_path.rglob("*")
            if p.is_file() and p.suffix.lower() in supported
        )
    else:
        inputs = [input_path]

    output_dir.mkdir(parents=True, exist_ok=True)

    # All results land flat in output_dir, so inputs sharing a stem (e.g.
    # "a.png" and "a.pdf", or equal names in different subfolders) would
    # otherwise overwrite each other's output.
    used_stems: set[str] = set()
    for p in inputs:
        base = _unique_stem(p.stem, used_stems)
        try:
            blocks = ocr_file(p, langs=langs, dpi=dpi, conf_threshold=conf_threshold)
            json_out = output_dir / f"{base}.json"
            csv_out = output_dir / f"{base}.csv"
            save_json(blocks, json_out)
            save_csv(blocks, csv_out)
            click.echo(f"[OK] {p} -> {json_out.name}, {csv_out.name}")
        except Exception as e:
            # Top-level CLI boundary: report the failure and continue with
            # the remaining inputs.
            click.echo(f"[ERR] {p}: {e}", err=True)


if __name__ == "__main__":
    main()