Spaces:

Zarm33na
/

bilingual-ocr-api

Sleeping

App Files Files Community

bilingual-ocr-api / urdu_ocr.py

Zarm33na

Initial deployment: bilingual OCR API (Urdu + English)

04f9475 3 months ago

raw

history blame contribute delete

7.44 kB

	"""
	Production Urdu OCR entry point: PDF and batch image support.

	- PDF input: convert each page to image (pypdfium2), run existing YOLOv8 + UTRNet pipeline per page.
	- Batch input: process all images in a folder (jpg/jpeg/png) through the same pipeline; a single non-PDF file is processed as one image.
	- Output: single UTF-8 text file with clear page/image separators.

	Reuses urdu_ocr_pipeline.run_urdu_ocr. No new ML, no GUI, no cloud. Python only.
	"""

	from __future__ import annotations

	import argparse
	import sys
	import tempfile
	from pathlib import Path
	from typing import Iterator

	from urdu_ocr_pipeline import run_urdu_ocr

	# ---------------------------------------------------------------------------
	# Constants
	# ---------------------------------------------------------------------------
	# Output file used by run() and the CLI when no output path is given.
	DEFAULT_OUTPUT = "result.txt"
	# File suffixes treated as images in folder mode; compared against the
	# lower-cased suffix, so matching is case-insensitive.
	IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png"}
	# Number of "=" characters in each separator line of the combined output.
	SEPARATOR_WIDTH = 60


	def _pdf_pages_to_images(pdf_path: Path) -> tuple[list[Path], object]:
	    """
	    Render every page of a PDF to a PNG file using pypdfium2 (no poppler needed).

	    Pages are rendered at 2x scale and saved as ``page_0001.png``,
	    ``page_0002.png``, ... inside a fresh temporary directory.

	    Args:
	        pdf_path: Path of the PDF file to render.

	    Returns:
	        ``(paths, temp_dir)``: the PNG paths in page order and the
	        ``tempfile.TemporaryDirectory`` that owns them. The caller must keep
	        ``temp_dir`` alive while using the paths and call ``temp_dir.cleanup()``
	        (or use it as a context manager) when done.

	    Raises:
	        ImportError: If pypdfium2 is not installed.
	        Exception: Any error from opening, rendering or saving is propagated;
	            the document is closed and the temporary directory removed first.
	    """
	    try:
	        import pypdfium2 as pdfium
	    except ImportError:
	        raise ImportError(
	            "PDF support requires pypdfium2. Install with: pip install pypdfium2"
	        ) from None

	    pdf = pdfium.PdfDocument(str(pdf_path))
	    try:
	        temp_dir = tempfile.TemporaryDirectory(prefix="urdu_ocr_pdf_")
	        try:
	            temp_path = Path(temp_dir.name)
	            paths = []
	            for i in range(len(pdf)):
	                page = pdf.get_page(i)
	                if hasattr(page, "render_topil"):
	                    # Legacy pypdfium2 API: render straight to a PIL image.
	                    pil_image = page.render_topil(
	                        scale=2.0,  # 2x for better OCR on small text
	                        rotation=0,
	                        colour=(255, 255, 255, 255),
	                    )
	                else:
	                    # Newer pypdfium2 renders to a bitmap that converts to PIL.
	                    # NOTE(review): relies on the library's default fill colour
	                    # being white - confirm against the installed version.
	                    pil_image = page.render(scale=2.0, rotation=0).to_pil()
	                out = temp_path / f"page_{i + 1:04d}.png"
	                pil_image.save(str(out))
	                paths.append(out)
	        except BaseException:
	            # Do not leave half-rendered pages behind (or rely on the
	            # finalizer) when a page fails.
	            temp_dir.cleanup()
	            raise
	    finally:
	        # Release the document handle on success and on failure alike.
	        pdf.close()
	    return paths, temp_dir


	def _collect_images_from_folder(folder: Path) -> list[Path]:
	    """
	    Return the image files located directly inside ``folder``.

	    Only regular files whose lower-cased suffix is in IMAGE_EXTENSIONS are
	    kept (sub-folders are not searched). The result is ordered by lower-cased
	    file name. A path that is not a directory yields an empty list.
	    """
	    if not folder.is_dir():
	        return []

	    def _is_image(entry: Path) -> bool:
	        return entry.is_file() and entry.suffix.lower() in IMAGE_EXTENSIONS

	    return sorted(
	        filter(_is_image, folder.iterdir()),
	        key=lambda entry: entry.name.lower(),
	    )


	def _write_combined_output(
	    output_path: Path,
	    items: list[tuple[str, list[str]]],
	) -> None:
	    """
	    Write every recognized item to one UTF-8 text file.

	    Each item is a (label, lines) pair, e.g. ("Page 1 (doc.pdf)", ["line1"]).
	    An item is written as a header (separator line, the label, separator line,
	    blank line), then its text lines one per row, then a trailing blank line,
	    so page and image boundaries stay visible in the output.

	    Missing parent directories of output_path are created. An existing file
	    is overwritten.
	    """
	    target = Path(output_path)
	    target.parent.mkdir(parents=True, exist_ok=True)
	    rule = "=" * SEPARATOR_WIDTH
	    with target.open("w", encoding="utf-8") as handle:
	        for label, lines in items:
	            handle.write(f"{rule}\n {label}\n{rule}\n\n")
	            handle.writelines(line + "\n" for line in lines)
	            handle.write("\n")


	def _process_input(
	    input_path: Path,
	    verbose: bool,
	) -> Iterator[tuple[str, list[str]]]:
	    """
	    Yield (label, lines) for each page/image found at input_path.

	    input_path may be a PDF file, any other single file (treated as one
	    image), or a directory of images. A failing page/image does not stop the
	    run: a warning is printed to stderr and a single "[Skipped: ...]"
	    placeholder line is yielded for it. An unreadable PDF, a folder without
	    images, or a missing path prints a message to stderr and yields nothing.

	    For PDFs, the temporary directory holding the rendered pages is cleaned
	    up as soon as iteration finishes or the generator is closed.

	    Args:
	        input_path: PDF file, image file, or directory of images.
	        verbose: Forwarded to run_urdu_ocr.

	    Yields:
	        (label, lines) tuples: PDF pages in page order, folder images in the
	        order returned by _collect_images_from_folder.
	    """
	    path = Path(input_path).resolve()

	    def _ocr_or_skip(image_path: Path, shown_name: str, **ocr_kwargs) -> list[str]:
	        # Run OCR on one image; on failure warn and return a placeholder line
	        # so one bad page/image never aborts the whole run.
	        try:
	            return run_urdu_ocr(image_path, verbose=verbose, **ocr_kwargs)
	        except Exception as e:
	            print(f"Warning: Skipping {shown_name}: {e}", file=sys.stderr)
	            return [f"[Skipped: {e}]"]

	    if path.is_file():
	        if path.suffix.lower() == ".pdf":
	            # PDF: convert each page to an image, then run the pipeline per page.
	            try:
	                page_paths, pdf_temp = _pdf_pages_to_images(path)
	            except Exception as e:
	                print(f"Error: Could not read PDF: {e}", file=sys.stderr)
	                return
	            try:
	                total = len(page_paths)
	                pages_root = path.parent / f"_pdf_pages_{path.stem}"
	                for number, page_path in enumerate(page_paths, start=1):
	                    label = f"Page {number} / {total} ({path.name})"
	                    lines = _ocr_or_skip(
	                        page_path,
	                        label,
	                        output_dir=pages_root / f"page_{number}",
	                    )
	                    yield (label, lines)
	            finally:
	                # Remove the rendered page images explicitly instead of
	                # relying on garbage collection of the temp-dir object.
	                cleanup = getattr(pdf_temp, "cleanup", None)
	                if callable(cleanup):
	                    cleanup()
	            return

	        # Any other file is treated as a single image.
	        yield (path.name, _ocr_or_skip(path, path.name))
	        return

	    if path.is_dir():
	        # Batch: all images in the folder.
	        image_paths = _collect_images_from_folder(path)
	        if not image_paths:
	            print(f"Warning: No jpg/png images found in {path}", file=sys.stderr)
	            return
	        total = len(image_paths)
	        for number, img_path in enumerate(image_paths, start=1):
	            label = f"{number} / {total}: {img_path.name}"
	            yield (label, _ocr_or_skip(img_path, img_path.name))
	        return

	    print(f"Error: Input not found (file or directory): {path}", file=sys.stderr)


	def run(
	    input_path: str | Path,
	    output_path: str | Path = DEFAULT_OUTPUT,
	    *,
	    verbose: bool = True,
	) -> Path:
	    """
	    Run Urdu OCR on input_path and write the combined text to output_path.

	    input_path may be a PDF, a single image file, or a folder of images.
	    Unreadable pages/images are skipped (recorded as placeholder lines) and
	    processing continues.

	    Args:
	        input_path: PDF file, image file, or directory of images.
	        output_path: Destination UTF-8 text file.
	        verbose: Forwarded to the OCR pipeline for per-image progress output.

	    Returns:
	        output_path as a Path. When nothing could be processed, a message is
	        printed to stderr and no file is written, but the path is still
	        returned.
	    """
	    destination = Path(output_path)
	    items = list(_process_input(Path(input_path), verbose=verbose))
	    if not items:
	        print("No pages or images processed.", file=sys.stderr)
	        return destination
	    _write_combined_output(destination, items)
	    return destination


	# ---------------------------------------------------------------------------
	# CLI
	# ---------------------------------------------------------------------------
	def _main() -> None:
	    """
	    Command-line entry point.

	    Parses --input/-i (required), --output/-o and --quiet/-q, exits with
	    status 1 when the input path does not exist, otherwise calls run() and
	    prints the output path it returns.
	    """
	    cli = argparse.ArgumentParser(
	        description="Urdu OCR: PDF or folder of images → single UTF-8 text file (YOLOv8 + UTRNet).",
	    )
	    cli.add_argument(
	        "--input", "-i",
	        required=True,
	        type=Path,
	        metavar="PATH",
	        help="Input: path to a PDF file or to a folder of images (jpg/png).",
	    )
	    cli.add_argument(
	        "--output", "-o",
	        default=DEFAULT_OUTPUT,
	        type=Path,
	        metavar="FILE",
	        help=f"Output UTF-8 text file (default: {DEFAULT_OUTPUT}).",
	    )
	    cli.add_argument(
	        "--quiet", "-q",
	        action="store_true",
	        help="Reduce per-image progress (detection count, recognition steps).",
	    )
	    options = cli.parse_args()

	    source = options.input
	    if not source.exists():
	        print(f"Error: Input not found: {source}", file=sys.stderr)
	        sys.exit(1)

	    written = run(source, options.output, verbose=not options.quiet)
	    print(f"Output written to: {written}")


	# Run the CLI only when this file is executed as a script, not when imported.
	if __name__ == "__main__":
	    _main()