""" Production Urdu OCR entry point: PDF and batch image support. - PDF input: convert each page to image (pypdfium2), run existing YOLOv8 + UTRNet pipeline per page. - Batch input: process all images in a folder (jpg/png) through the same pipeline. - Output: single UTF-8 text file with clear page/image separators. Reuses urdu_ocr_pipeline.run_urdu_ocr. No new ML, no GUI, no cloud. Python only. """ from __future__ import annotations import argparse import sys import tempfile from pathlib import Path from typing import Iterator from urdu_ocr_pipeline import run_urdu_ocr # --------------------------------------------------------------------------- # Constants # --------------------------------------------------------------------------- DEFAULT_OUTPUT = "result.txt" IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png"} SEPARATOR_WIDTH = 60 def _pdf_pages_to_images(pdf_path: Path) -> tuple[list[Path], object]: """ Convert each PDF page to a PNG image using pypdfium2 (lightweight, no poppler). Renders each page into a temporary directory. Returns (list of image paths, temp_dir). Caller must keep temp_dir alive while using the paths; use as context manager. """ try: import pypdfium2 as pdfium except ImportError: raise ImportError( "PDF support requires pypdfium2. Install with: pip install pypdfium2" ) from None pdf = pdfium.PdfDocument(str(pdf_path)) n_pages = len(pdf) temp_dir = tempfile.TemporaryDirectory(prefix="urdu_ocr_pdf_") temp_path = Path(temp_dir.name) paths = [] for i in range(n_pages): page = pdf.get_page(i) pil_image = page.render_topil( scale=2.0, # 2x for better OCR on small text rotation=0, colour=(255, 255, 255, 255), ) out = temp_path / f"page_{i + 1:04d}.png" pil_image.save(str(out)) paths.append(out) pdf.close() return paths, temp_dir def _collect_images_from_folder(folder: Path) -> list[Path]: """Collect jpg/png paths from folder, sorted by name for stable order.""" if not folder.is_dir(): return [] paths = [ p for p in folder.iterdir() if p.is_file() and p.suffix.lower() in IMAGE_EXTENSIONS ] paths.sort(key=lambda p: p.name.lower()) return paths def _write_combined_output( output_path: Path, items: list[tuple[str, list[str]]], ) -> None: """ Write all recognized text to a single UTF-8 file with clear separators. items: list of (label, lines) e.g. ("Page 1 (doc.pdf)", ["line1", "line2"]) Preserves page and image boundaries for traceability. """ output_path = Path(output_path) output_path.parent.mkdir(parents=True, exist_ok=True) with open(output_path, "w", encoding="utf-8") as f: for label, lines in items: sep = "=" * SEPARATOR_WIDTH f.write(f"{sep}\n") f.write(f" {label}\n") f.write(f"{sep}\n\n") for line in lines: f.write(line + "\n") f.write("\n") return def _process_input( input_path: Path, verbose: bool, ) -> Iterator[tuple[str, list[str]]]: """ Yield (label, lines) for each page/image from input_path. input_path can be a PDF file or a directory of images. Skips unreadable items and continues; raises only on fatal errors. """ path = Path(input_path).resolve() if path.is_file(): if path.suffix.lower() == ".pdf": # PDF: convert each page to image (pypdfium2), then run pipeline per page try: page_paths, _pdf_temp = _pdf_pages_to_images(path) except Exception as e: print(f"Error: Could not read PDF: {e}", file=sys.stderr) return # Keep _pdf_temp alive until all pages are processed (temp dir cleanup on exit) for i, page_path in enumerate(page_paths): label = f"Page {i + 1} / {len(page_paths)} ({path.name})" try: lines = run_urdu_ocr( page_path, output_dir=path.parent / f"_pdf_pages_{path.stem}" / f"page_{i + 1}", verbose=verbose, ) except Exception as e: print(f"Warning: Skipping {label}: {e}", file=sys.stderr) lines = [f"[Skipped: {e}]"] yield (label, lines) return # Single image label = path.name try: lines = run_urdu_ocr(path, verbose=verbose) except Exception as e: print(f"Warning: Skipping {path.name}: {e}", file=sys.stderr) lines = [f"[Skipped: {e}]"] yield (label, lines) return if path.is_dir(): # Batch: all images in folder image_paths = _collect_images_from_folder(path) if not image_paths: print(f"Warning: No jpg/png images found in {path}", file=sys.stderr) return for i, img_path in enumerate(image_paths): label = f"{i + 1} / {len(image_paths)}: {img_path.name}" try: lines = run_urdu_ocr(img_path, verbose=verbose) except Exception as e: print(f"Warning: Skipping {img_path.name}: {e}", file=sys.stderr) lines = [f"[Skipped: {e}]"] yield (label, lines) return print(f"Error: Input not found (file or directory): {path}", file=sys.stderr) def run( input_path: str | Path, output_path: str | Path = DEFAULT_OUTPUT, *, verbose: bool = True, ) -> Path: """ Run Urdu OCR on input (PDF or image folder), write combined text to output_path. Skips unreadable images/pages and continues. No crash on single-item failure. """ input_path = Path(input_path) output_path = Path(output_path) items = list(_process_input(input_path, verbose=verbose)) if not items: print("No pages or images processed.", file=sys.stderr) return output_path _write_combined_output(output_path, items) return output_path # --------------------------------------------------------------------------- # CLI # --------------------------------------------------------------------------- def _main() -> None: parser = argparse.ArgumentParser( description="Urdu OCR: PDF or folder of images → single UTF-8 text file (YOLOv8 + UTRNet).", ) parser.add_argument( "--input", "-i", type=Path, required=True, metavar="PATH", help="Input: path to a PDF file or to a folder of images (jpg/png).", ) parser.add_argument( "--output", "-o", type=Path, default=DEFAULT_OUTPUT, metavar="FILE", help=f"Output UTF-8 text file (default: {DEFAULT_OUTPUT}).", ) parser.add_argument( "--quiet", "-q", action="store_true", help="Reduce per-image progress (detection count, recognition steps).", ) args = parser.parse_args() if not args.input.exists(): print(f"Error: Input not found: {args.input}", file=sys.stderr) sys.exit(1) out = run( args.input, args.output, verbose=not args.quiet, ) print(f"Output written to: {out}") if __name__ == "__main__": _main()