Spaces:
Sleeping
Sleeping
| """ | |
| Production Urdu OCR entry point: PDF and batch image support. | |
| - PDF input: convert each page to image (pypdfium2), run existing YOLOv8 + UTRNet pipeline per page. | |
| - Batch input: process all images in a folder (jpg/png) through the same pipeline. | |
| - Output: single UTF-8 text file with clear page/image separators. | |
| Reuses urdu_ocr_pipeline.run_urdu_ocr. No new ML, no GUI, no cloud. Python only. | |
| """ | |
| from __future__ import annotations | |
| import argparse | |
| import sys | |
| import tempfile | |
| from pathlib import Path | |
| from typing import Iterator | |
| from urdu_ocr_pipeline import run_urdu_ocr | |
| # --------------------------------------------------------------------------- | |
| # Constants | |
| # --------------------------------------------------------------------------- | |
# Output file used by run() and the CLI when no explicit output path is given.
DEFAULT_OUTPUT = "result.txt"
# File suffixes (compared lower-cased) treated as images when scanning a folder.
IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png"}
# Number of "=" characters in each separator line of the combined output file.
SEPARATOR_WIDTH = 60
def _pdf_pages_to_images(
    pdf_path: Path,
) -> tuple[list[Path], tempfile.TemporaryDirectory]:
    """
    Render every page of a PDF to a PNG file using pypdfium2.

    Pages are rendered at 2x scale into a fresh temporary directory and saved
    as ``page_0001.png``, ``page_0002.png``, ... in page order.

    Args:
        pdf_path: Path of the PDF document to render.

    Returns:
        ``(paths, temp_dir)`` where ``paths`` lists the PNG files in page order
        and ``temp_dir`` is the ``tempfile.TemporaryDirectory`` that owns them.
        The caller must keep ``temp_dir`` alive while using the paths and is
        responsible for cleaning it up (e.g. by using it as a context manager).

    Raises:
        ImportError: If pypdfium2 is not installed.
        Exception: Any error raised while opening, rendering or saving is
            propagated; the document is closed and the temporary directory
            removed before it leaves this function.
    """
    try:
        import pypdfium2 as pdfium
    except ImportError:
        raise ImportError(
            "PDF support requires pypdfium2. Install with: pip install pypdfium2"
        ) from None

    pdf = pdfium.PdfDocument(str(pdf_path))
    try:
        temp_dir = tempfile.TemporaryDirectory(prefix="urdu_ocr_pdf_")
        try:
            temp_path = Path(temp_dir.name)
            paths: list[Path] = []
            for i in range(len(pdf)):
                page = pdf.get_page(i)
                if hasattr(page, "render_topil"):
                    # Rendering call this module was originally written against.
                    pil_image = page.render_topil(
                        scale=2.0,  # 2x for better OCR on small text
                        rotation=0,
                        colour=(255, 255, 255, 255),
                    )
                else:
                    # NOTE(review): releases without render_topil presumably
                    # expose render(...).to_pil() and fill with white by
                    # default -- confirm against the installed pypdfium2.
                    pil_image = page.render(scale=2.0, rotation=0).to_pil()
                out = temp_path / f"page_{i + 1:04d}.png"
                pil_image.save(str(out))
                paths.append(out)
        except BaseException:
            # Nothing is handed back on failure, so remove partial output now
            # instead of leaving it to the garbage collector.
            temp_dir.cleanup()
            raise
    finally:
        # Always release the document handle, even when a page fails.
        pdf.close()
    return paths, temp_dir
def _collect_images_from_folder(folder: Path) -> list[Path]:
    """
    List the image files that sit directly inside ``folder``.

    Only regular files whose lower-cased suffix is in ``IMAGE_EXTENSIONS`` are
    kept; sub-directories are not descended into. The result is ordered by
    lower-cased file name. A path that is not a directory yields an empty list.
    """
    if folder.is_dir():
        candidates = (
            entry
            for entry in folder.iterdir()
            if entry.is_file() and entry.suffix.lower() in IMAGE_EXTENSIONS
        )
        return sorted(candidates, key=lambda entry: entry.name.lower())
    return []
def _write_combined_output(
    output_path: Path,
    items: list[tuple[str, list[str]]],
) -> None:
    """
    Write every recognized section to one UTF-8 text file.

    Each ``(label, lines)`` entry of ``items`` becomes a section: a banner made
    of two ``SEPARATOR_WIDTH``-wide "=" rules around the label, a blank line,
    the text lines one per row, and a trailing blank line. Missing parent
    directories of ``output_path`` are created; an existing file is overwritten.
    """
    destination = Path(output_path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    rule = "=" * SEPARATOR_WIDTH
    with destination.open("w", encoding="utf-8") as sink:
        for heading, text_lines in items:
            # Banner: rule, label, rule, then an empty line before the text.
            sink.write(f"{rule}\n {heading}\n{rule}\n\n")
            sink.writelines(text_line + "\n" for text_line in text_lines)
            sink.write("\n")
def _ocr_or_skip(
    image_path: Path,
    warn_name: str,
    verbose: bool,
    output_dir: Path | None = None,
) -> list[str]:
    """Run OCR on one image; on failure warn on stderr and return a placeholder line."""
    # Forward output_dir only when given so run_urdu_ocr keeps its own default.
    extra = {} if output_dir is None else {"output_dir": output_dir}
    try:
        return run_urdu_ocr(image_path, verbose=verbose, **extra)
    except Exception as e:
        # Best-effort by design: one bad page/image must not abort the batch.
        print(f"Warning: Skipping {warn_name}: {e}", file=sys.stderr)
        return [f"[Skipped: {e}]"]


def _process_input(
    input_path: Path,
    verbose: bool,
) -> Iterator[tuple[str, list[str]]]:
    """
    Yield ``(label, lines)`` for each page or image found at ``input_path``.

    Supported inputs:
      * a ``.pdf`` file: every page is rendered to an image and run through OCR;
      * any other file: treated as a single image;
      * a directory: every jpg/png directly inside it, in name order.

    Args:
        input_path: PDF file, image file, or directory of images.
        verbose: Forwarded to ``run_urdu_ocr``.

    Yields:
        ``(label, lines)`` tuples. When OCR fails for an item, a warning goes
        to stderr and ``lines`` is ``["[Skipped: <error>]"]`` so the item still
        appears in the output.

    If the PDF cannot be read, the directory holds no images, or the path does
    not exist, a message is printed to stderr and nothing is yielded.
    """
    path = Path(input_path).resolve()

    if path.is_file():
        if path.suffix.lower() != ".pdf":
            # Single image
            yield (path.name, _ocr_or_skip(path, path.name, verbose))
            return

        # PDF: convert each page to image (pypdfium2), then run pipeline per page
        try:
            page_paths, pdf_temp = _pdf_pages_to_images(path)
        except Exception as e:
            print(f"Error: Could not read PDF: {e}", file=sys.stderr)
            return
        # The context manager removes the rendered pages as soon as the last
        # page has been consumed, or when the generator is closed early,
        # instead of relying on garbage collection of the temp-dir object.
        with pdf_temp:
            total = len(page_paths)
            for number, page_path in enumerate(page_paths, start=1):
                label = f"Page {number} / {total} ({path.name})"
                lines = _ocr_or_skip(
                    page_path,
                    label,
                    verbose,
                    output_dir=path.parent / f"_pdf_pages_{path.stem}" / f"page_{number}",
                )
                yield (label, lines)
        return

    if path.is_dir():
        # Batch: all images in folder
        image_paths = _collect_images_from_folder(path)
        if not image_paths:
            print(f"Warning: No jpg/png images found in {path}", file=sys.stderr)
            return
        total = len(image_paths)
        for number, img_path in enumerate(image_paths, start=1):
            label = f"{number} / {total}: {img_path.name}"
            yield (label, _ocr_or_skip(img_path, img_path.name, verbose))
        return

    print(f"Error: Input not found (file or directory): {path}", file=sys.stderr)
def run(
    input_path: str | Path,
    output_path: str | Path = DEFAULT_OUTPUT,
    *,
    verbose: bool = True,
) -> Path:
    """
    OCR a PDF, an image, or a folder of images and save the combined text.

    All results from ``_process_input`` are gathered first. If there are any,
    they are written to ``output_path``; otherwise a notice is printed to
    stderr and no file is written. The output path is returned in both cases.
    """
    source = Path(input_path)
    destination = Path(output_path)
    results = [*_process_input(source, verbose=verbose)]
    if results:
        _write_combined_output(destination, results)
    else:
        print("No pages or images processed.", file=sys.stderr)
    return destination
| # --------------------------------------------------------------------------- | |
| # CLI | |
| # --------------------------------------------------------------------------- | |
def _main() -> None:
    """
    Command-line entry point.

    Parses ``--input``, ``--output`` and ``--quiet``, exits with status 1 when
    the input path does not exist, otherwise calls ``run`` and prints the path
    it returns.
    """
    cli = argparse.ArgumentParser(
        description="Urdu OCR: PDF or folder of images → single UTF-8 text file (YOLOv8 + UTRNet).",
    )
    # (flags, keyword settings) for each option, registered in display order.
    option_specs = (
        (
            ("--input", "-i"),
            {
                "type": Path,
                "required": True,
                "metavar": "PATH",
                "help": "Input: path to a PDF file or to a folder of images (jpg/png).",
            },
        ),
        (
            ("--output", "-o"),
            {
                "type": Path,
                "default": DEFAULT_OUTPUT,
                "metavar": "FILE",
                "help": f"Output UTF-8 text file (default: {DEFAULT_OUTPUT}).",
            },
        ),
        (
            ("--quiet", "-q"),
            {
                "action": "store_true",
                "help": "Reduce per-image progress (detection count, recognition steps).",
            },
        ),
    )
    for flags, settings in option_specs:
        cli.add_argument(*flags, **settings)

    opts = cli.parse_args()
    source = opts.input
    if not source.exists():
        print(f"Error: Input not found: {source}", file=sys.stderr)
        sys.exit(1)

    written = run(source, opts.output, verbose=not opts.quiet)
    print(f"Output written to: {written}")
# Run the command-line interface only when this file is executed as a script.
if __name__ == "__main__":
    _main()