OCR / ocrpkg /core.py
Eyob-Sol's picture
Upload 15 files
9a5a8ff verified
Raw
History Blame Contribute Delete
5.6 kB
from __future__ import annotations
import json
from pathlib import Path
from typing import Iterable, List, Tuple
import click
import numpy as np
from PIL import Image
from .models import get_reader, get_paddle_reader
from .utils import preprocess, quad_to_bbox
from .schema import OCRBlock
from .pdf import pdf_to_images
# Supported extensions (matched against the lower-cased file suffix).
# Raster image formats: each such file is opened as a single page.
IMAGE_EXTS = {".png", ".jpg", ".jpeg", ".bmp", ".webp", ".tif", ".tiff", ".gif"}
# Document formats that are rendered to page images via pdf_to_images.
PDF_EXTS = {".pdf"}
# ---------------- EasyOCR (base) ----------------
def run_ocr_on_image(
    img: Image.Image,
    langs: Iterable[str] = ("en",),
    conf_threshold: float = 0.3,
    page: int = 1,
) -> List[OCRBlock]:
    """Recognise text in one PIL image with EasyOCR.

    The image goes through ``preprocess`` and is handed to the reader
    returned by ``get_reader`` as a NumPy array.

    Args:
        img: Page image to read.
        langs: Language codes forwarded (as a tuple) to ``get_reader``.
        conf_threshold: Detections with a confidence below this value
            are dropped.
        page: Page number stored on every returned block.

    Returns:
        One ``OCRBlock`` per kept detection, in the order the reader
        reported them.
    """
    ocr_reader = get_reader(tuple(langs))
    pixels = np.array(preprocess(img))
    # detail=1 makes every detection a (quad, text, confidence) triple.
    detections = ocr_reader.readtext(pixels, detail=1, paragraph=False)

    kept: List[OCRBlock] = []
    for quad, text, conf in detections:
        # Skip detections without a usable confidence score.
        if conf is None or conf < conf_threshold:
            continue
        label = str(text)
        # Skip whitespace-only text.
        if not label.strip():
            continue
        kept.append(
            OCRBlock(
                page=page,
                bbox=quad_to_bbox(quad),
                text=label,
                confidence=float(conf),
            )
        )
    return kept
# ---------------- PaddleOCR (high quality) ----------------
def run_ocr_on_image_paddle(
    img: Image.Image,
    lang: str = "en",
    conf_threshold: float = 0.3,
    page: int = 1,
) -> List[OCRBlock]:
    """Run PaddleOCR (det + rec) on a PIL image and return OCRBlocks.

    Args:
        img: Page image to read; it is converted to RGB and then to the
            BGR channel order before being passed to the OCR engine.
        lang: Language code forwarded to ``get_paddle_reader``.
        conf_threshold: Detections with a confidence below this value
            are dropped.
        page: Page number stored on every returned block.

    Returns:
        One ``OCRBlock`` per kept detection. The list is empty when the
        engine finds no text.
    """
    # Imported here so OpenCV is only needed when this function is called.
    import cv2

    ocr = get_paddle_reader(lang)
    arr = cv2.cvtColor(np.array(img.convert("RGB")), cv2.COLOR_RGB2BGR)
    result = ocr.ocr(arr, cls=True)

    blocks: List[OCRBlock] = []
    # PaddleOCR signals "nothing detected" with None, either for the whole
    # result or for an individual image entry; treat both as empty instead
    # of failing with "'NoneType' object is not iterable".
    for line in result or []:
        if not line:
            continue
        for det in line:
            quad, (text, conf) = det
            if conf is None or conf < conf_threshold or not str(text).strip():
                continue
            # Axis-aligned bounding box enclosing the detected quadrilateral.
            xs = [int(p[0]) for p in quad]
            ys = [int(p[1]) for p in quad]
            bbox = (min(xs), min(ys), max(xs), max(ys))
            blocks.append(
                OCRBlock(page=page, bbox=bbox, text=str(text), confidence=float(conf))
            )
    return blocks
# ---------------- File routing ----------------
def render_input_to_pages(path: Path, dpi: int = 200) -> List[Image.Image]:
    """Convert a file (image or PDF) into a list of PIL pages.

    Args:
        path: File to load; its lower-cased suffix selects the loader.
        dpi: Resolution forwarded to ``pdf_to_images`` for PDF input.
            It is not used for image input.

    Returns:
        The pages produced by ``pdf_to_images`` for a PDF, or a
        one-element list holding the RGB-converted image otherwise.

    Raises:
        ValueError: If the suffix is in neither ``PDF_EXTS`` nor
            ``IMAGE_EXTS``.
    """
    suffix = path.suffix.lower()
    if suffix in PDF_EXTS:
        return pdf_to_images(path, dpi=dpi)
    if suffix in IMAGE_EXTS:
        # convert() returns a new, fully loaded image, so the source file
        # can be closed as soon as the conversion is done. Without the
        # context manager the handle may stay open (e.g. multi-frame files).
        with Image.open(path) as src:
            return [src.convert("RGB")]
    raise ValueError(f"Unsupported file type: {path.suffix}")
def ocr_file(
    input_path: Path,
    langs: Iterable[str] = ("en",),
    dpi: int = 200,
    conf_threshold: float = 0.3,
) -> List[OCRBlock]:
    """Main OCR entrypoint for one file using EasyOCR (default).

    Args:
        input_path: Image or PDF file to process.
        langs: Language codes passed to ``run_ocr_on_image``.
        dpi: Render resolution passed to ``render_input_to_pages``.
        conf_threshold: Minimum confidence passed to ``run_ocr_on_image``.

    Returns:
        The blocks of every page concatenated in page order; pages are
        numbered starting at 1.
    """
    # Materialise once: run_ocr_on_image consumes ``langs`` again on every
    # page, so a one-shot iterator would be empty from the second page on.
    lang_codes = tuple(langs)
    pages = render_input_to_pages(input_path, dpi=dpi)
    all_blocks: List[OCRBlock] = []
    for page_no, page_img in enumerate(pages, start=1):
        all_blocks.extend(
            run_ocr_on_image(
                page_img,
                langs=lang_codes,
                conf_threshold=conf_threshold,
                page=page_no,
            )
        )
    return all_blocks
# ---------------- Save helpers ----------------
def save_json(blocks: List[OCRBlock], out_path: Path) -> None:
    """Write OCR blocks to ``out_path`` as a UTF-8 JSON array.

    Missing parent directories are created. Each block is serialised
    through its ``model_dump()`` method; non-ASCII text is written
    unescaped and the output is indented by two spaces.
    """
    target_dir = out_path.parent
    target_dir.mkdir(parents=True, exist_ok=True)

    records = [block.model_dump() for block in blocks]
    payload = json.dumps(records, ensure_ascii=False, indent=2)
    out_path.write_text(payload, encoding="utf-8")
def save_csv(blocks: List[OCRBlock], out_path: Path) -> None:
    """Save OCR results to CSV (via pandas).

    Missing parent directories are created. One row is written per
    block with the columns ``page, x1, y1, x2, y2, text, confidence``,
    where ``x1, y1, x2, y2`` are the four components of ``block.bbox``.

    Args:
        blocks: Blocks to write; may be empty.
        out_path: Destination CSV file.
    """
    # Imported here so pandas is only needed when CSV output is requested.
    import pandas as pd

    # Passing the column order explicitly keeps the header row present
    # even when there are no blocks (an empty frame would otherwise be
    # written without any header).
    columns = ["page", "x1", "y1", "x2", "y2", "text", "confidence"]

    out_path.parent.mkdir(parents=True, exist_ok=True)
    rows = []
    for b in blocks:
        x1, y1, x2, y2 = b.bbox
        rows.append(
            {
                "page": b.page,
                "x1": x1,
                "y1": y1,
                "x2": x2,
                "y2": y2,
                "text": b.text,
                "confidence": b.confidence,
            }
        )
    pd.DataFrame(rows, columns=columns).to_csv(out_path, index=False)
# ---------------- CLI ----------------
@click.command(context_settings=dict(help_option_names=["-h", "--help"]))
@click.argument("input_path", type=click.Path(exists=True, path_type=Path))
@click.argument("output_dir", type=click.Path(path_type=Path))
@click.option("--lang", "langs", multiple=True, default=["en"], show_default=True,
              help="Languages for EasyOCR (e.g., en, fr, de)")
@click.option("--dpi", default=200, show_default=True, help="PDF render DPI")
@click.option("--conf-threshold", default=0.3, show_default=True,
              help="Min confidence to keep a block")
def main(input_path: Path, output_dir: Path, langs: list[str], dpi: int, conf_threshold: float):
    """Run OCR on a file or a folder recursively, save JSON + CSV results (EasyOCR)."""
    # The docstring above is the command's --help text; keep notes in comments.
    #
    # input_path:     a single file, or a directory searched recursively for
    #                 files with a supported image/PDF suffix.
    # output_dir:     created if missing; receives <stem>.json and <stem>.csv
    #                 for every processed input.
    # langs/dpi/conf_threshold: forwarded unchanged to ocr_file.
    #
    # NOTE: outputs are named after the input's stem only, so two inputs
    # sharing a stem (e.g. a/x.png and b/x.pdf) write to the same files.

    # Build the suffix set once instead of on every directory entry.
    supported = IMAGE_EXTS | PDF_EXTS

    inputs: list[Path]
    if input_path.is_dir():
        # is_file() skips directories whose name happens to end in a
        # supported suffix; sorting makes the processing order stable.
        inputs = sorted(
            p
            for p in input_path.rglob("*")
            if p.is_file() and p.suffix.lower() in supported
        )
    else:
        inputs = [input_path]

    output_dir.mkdir(parents=True, exist_ok=True)

    for p in inputs:
        try:
            blocks = ocr_file(p, langs=langs, dpi=dpi, conf_threshold=conf_threshold)
            base = p.stem
            json_out = output_dir / f"{base}.json"
            csv_out = output_dir / f"{base}.csv"
            save_json(blocks, json_out)
            save_csv(blocks, csv_out)
            click.echo(f"[OK] {p} -> {json_out.name}, {csv_out.name}")
        except Exception as e:
            # Batch boundary: report the failure on stderr and continue
            # with the remaining inputs.
            click.echo(f"[ERR] {p}: {e}", err=True)
# Invoke the click command when this module is executed as a script.
if __name__ == "__main__":
    main()