Spaces:

Eyob-Sol
/

OCR

Sleeping

App Files Files Community

OCR / ocrpkg /core.py

Eyob-Sol

Upload 15 files

9a5a8ff verified 10 months ago

Raw

History Blame Contribute Delete

5.6 kB

	from __future__ import annotations
	import json
	from pathlib import Path
	from typing import Iterable, List, Tuple

	import click
	import numpy as np
	from PIL import Image

	from .models import get_reader, get_paddle_reader
	from .utils import preprocess, quad_to_bbox
	from .schema import OCRBlock
	from .pdf import pdf_to_images

	# Supported file extensions (leading dot included). Callers in this module
	# lower-case a path's suffix before testing membership in these sets.
	# Raster image formats that are opened directly with PIL.
	IMAGE_EXTS = {".png", ".jpg", ".jpeg", ".bmp", ".webp", ".tif", ".tiff", ".gif"}
	# Document formats that are rendered to page images via pdf_to_images.
	PDF_EXTS = {".pdf"}


	# ---------------- EasyOCR (base) ----------------
	def run_ocr_on_image(
	    img: Image.Image,
	    langs: Iterable[str] = ("en",),
	    conf_threshold: float = 0.3,
	    page: int = 1,
	) -> List[OCRBlock]:
	    """Run EasyOCR over one PIL image and return the accepted detections.

	    Args:
	        img: Image to recognise; it is passed through ``preprocess`` first.
	        langs: Language codes handed to ``get_reader`` as a tuple.
	        conf_threshold: Detections whose confidence is below this are dropped.
	        page: Page number stamped on every returned block.

	    Returns:
	        One ``OCRBlock`` per kept detection, in the order the reader
	        reported them.
	    """
	    ocr_reader = get_reader(tuple(langs))
	    pixels = np.array(preprocess(img))
	    # With detail=1 each detection is a (quad, text, confidence) triple.
	    detections = ocr_reader.readtext(pixels, detail=1, paragraph=False)

	    # Keep a detection only if it has a confidence, that confidence is not
	    # below the threshold, and its text is not blank.
	    return [
	        OCRBlock(
	            page=page,
	            bbox=quad_to_bbox(quad),
	            text=str(text),
	            confidence=float(conf),
	        )
	        for quad, text, conf in detections
	        if conf is not None and not conf < conf_threshold and str(text).strip()
	    ]


	# ---------------- PaddleOCR (high quality) ----------------
	def run_ocr_on_image_paddle(
	    img: Image.Image,
	    lang: str = "en",
	    conf_threshold: float = 0.3,
	    page: int = 1,
	) -> List[OCRBlock]:
	    """Run PaddleOCR (det + rec) on a PIL image and return OCRBlocks.

	    Args:
	        img: Image to recognise; it is converted to RGB and then to BGR
	            channel order before being handed to the reader.
	        lang: Language code passed to ``get_paddle_reader``.
	        conf_threshold: Detections whose confidence is below this are dropped.
	        page: Page number stamped on every returned block.

	    Returns:
	        One ``OCRBlock`` per kept detection. The bounding box is the
	        axis-aligned ``(min_x, min_y, max_x, max_y)`` envelope of the detected
	        quad, with coordinates truncated to ``int``. An image with no
	        detected text yields an empty list.
	    """
	    import cv2

	    ocr = get_paddle_reader(lang)
	    arr = cv2.cvtColor(np.array(img.convert("RGB")), cv2.COLOR_RGB2BGR)
	    result = ocr.ocr(arr, cls=True)

	    blocks: List[OCRBlock] = []
	    # PaddleOCR reports an image without any detected text as None (either
	    # the whole result or the per-image entry), so guard both levels instead
	    # of iterating over None.
	    for line in result or []:
	        if not line:
	            continue
	        for quad, (text, conf) in line:
	            if conf is None or conf < conf_threshold or not str(text).strip():
	                continue
	            xs = [int(p[0]) for p in quad]
	            ys = [int(p[1]) for p in quad]
	            bbox = (min(xs), min(ys), max(xs), max(ys))
	            blocks.append(
	                OCRBlock(page=page, bbox=bbox, text=str(text), confidence=float(conf))
	            )
	    return blocks


	# ---------------- File routing ----------------
	def render_input_to_pages(path: Path, dpi: int = 200) -> List[Image.Image]:
	    """Convert a file (image or PDF) into a list of PIL pages.

	    Args:
	        path: File to load; its lower-cased suffix selects the loader.
	        dpi: Resolution forwarded to ``pdf_to_images`` for PDF inputs.

	    Returns:
	        For PDFs, whatever ``pdf_to_images`` returns; for images, a
	        single-element list holding the image converted to RGB.

	    Raises:
	        ValueError: If the suffix is in neither ``PDF_EXTS`` nor ``IMAGE_EXTS``.
	    """
	    suffix = path.suffix.lower()
	    if suffix in PDF_EXTS:
	        return pdf_to_images(path, dpi=dpi)
	    if suffix in IMAGE_EXTS:
	        # Image.open is lazy and keeps the file handle; convert() returns an
	        # independent, fully loaded copy, so the source can be closed here.
	        # NOTE(review): multi-frame TIFF/GIF inputs appear to yield only
	        # their first frame -- confirm whether that is intended.
	        with Image.open(path) as src:
	            return [src.convert("RGB")]
	    raise ValueError(f"Unsupported file type: {path.suffix}")


	def ocr_file(
	    input_path: Path,
	    langs: Iterable[str] = ("en",),
	    dpi: int = 200,
	    conf_threshold: float = 0.3,
	) -> List[OCRBlock]:
	    """Main OCR entrypoint for one file using EasyOCR (default).

	    Args:
	        input_path: Image or PDF file to process.
	        langs: Language codes for the reader; any iterable is accepted,
	            including one-shot iterators.
	        dpi: Render resolution used for PDF inputs.
	        conf_threshold: Minimum confidence for a block to be kept.

	    Returns:
	        The blocks of all pages concatenated in page order; pages are
	        numbered starting at 1.
	    """
	    # Materialise once: run_ocr_on_image consumes ``langs`` on every call, so
	    # a generator would be exhausted after the first page.
	    lang_codes = tuple(langs)
	    pages = render_input_to_pages(input_path, dpi=dpi)
	    all_blocks: List[OCRBlock] = []
	    for i, page_img in enumerate(pages, start=1):
	        all_blocks.extend(
	            run_ocr_on_image(
	                page_img,
	                langs=lang_codes,
	                conf_threshold=conf_threshold,
	                page=i,
	            )
	        )
	    return all_blocks


	# ---------------- Save helpers ----------------
	def save_json(blocks: List[OCRBlock], out_path: Path) -> None:
	    """Write the blocks to ``out_path`` as a UTF-8 JSON array.

	    Missing parent directories are created. Each block is serialised through
	    its ``model_dump()`` method; non-ASCII text is written unescaped and the
	    output is indented by two spaces.
	    """
	    out_path.parent.mkdir(parents=True, exist_ok=True)
	    payload = json.dumps(
	        [block.model_dump() for block in blocks],
	        ensure_ascii=False,
	        indent=2,
	    )
	    out_path.write_text(payload, encoding="utf-8")


	def save_csv(blocks: List[OCRBlock], out_path: Path) -> None:
	    """Save OCR results to CSV (via pandas).

	    Missing parent directories are created. The file always has the header
	    ``page,x1,y1,x2,y2,text,confidence`` -- also when ``blocks`` is empty --
	    followed by one row per block, with the bounding box unpacked into its
	    four coordinates.
	    """
	    import pandas as pd

	    # Fixing the columns up front keeps the schema stable: without it an
	    # empty block list would produce a file with no header row at all.
	    columns = ["page", "x1", "y1", "x2", "y2", "text", "confidence"]

	    out_path.parent.mkdir(parents=True, exist_ok=True)
	    rows = []
	    for b in blocks:
	        x1, y1, x2, y2 = b.bbox
	        rows.append(
	            {
	                "page": b.page,
	                "x1": x1,
	                "y1": y1,
	                "x2": x2,
	                "y2": y2,
	                "text": b.text,
	                "confidence": b.confidence,
	            }
	        )
	    pd.DataFrame(rows, columns=columns).to_csv(out_path, index=False)


	# ---------------- CLI ----------------
	# CLI entry point. The function docstring doubles as the --help text, so the
	# implementation notes live in comments instead.
	@click.command(context_settings=dict(help_option_names=["-h", "--help"]))
	@click.argument("input_path", type=click.Path(exists=True, path_type=Path))
	@click.argument("output_dir", type=click.Path(path_type=Path))
	@click.option("--lang", "langs", multiple=True, default=["en"], show_default=True,
	    help="Languages for EasyOCR (e.g., en, fr, de)")
	@click.option("--dpi", default=200, show_default=True, help="PDF render DPI")
	@click.option("--conf-threshold", default=0.3, show_default=True,
	    help="Min confidence to keep a block")
	def main(input_path: Path, output_dir: Path, langs: list[str], dpi: int, conf_threshold: float):
	    """Run OCR on a file or a folder recursively, save JSON + CSV results (EasyOCR)."""
	    # Build the accepted-extension set once rather than for every path.
	    supported = IMAGE_EXTS | PDF_EXTS

	    if input_path.is_dir():
	        # Only regular files qualify (a directory may be named "scans.pdf");
	        # sorting makes the processing order, and therefore the collision
	        # suffixes assigned below, deterministic.
	        inputs: list[Path] = sorted(
	            p
	            for p in input_path.rglob("*")
	            if p.is_file() and p.suffix.lower() in supported
	        )
	    else:
	        inputs = [input_path]

	    output_dir.mkdir(parents=True, exist_ok=True)

	    # All results land flat in output_dir and are named after the input stem.
	    # Inputs sharing a stem (a.png / a.pdf, or equal names in different
	    # subfolders) would overwrite each other, so later ones get "_2", "_3", ...
	    used_names: set[str] = set()
	    for p in inputs:
	        base = p.stem
	        suffix_no = 2
	        while base in used_names:
	            base = f"{p.stem}_{suffix_no}"
	            suffix_no += 1
	        used_names.add(base)

	        # Best effort: a failure on one input is reported on stderr and the
	        # remaining inputs are still processed.
	        try:
	            blocks = ocr_file(p, langs=langs, dpi=dpi, conf_threshold=conf_threshold)
	            json_out = output_dir / f"{base}.json"
	            csv_out = output_dir / f"{base}.csv"
	            save_json(blocks, json_out)
	            save_csv(blocks, csv_out)
	            click.echo(f"[OK] {p} -> {json_out.name}, {csv_out.name}")
	        except Exception as e:
	            click.echo(f"[ERR] {p}: {e}", err=True)


	# Invoke the click command only when this module is executed as a script.
	if __name__ == "__main__":
	main()