#!/usr/bin/env python3
"""
OCR CLI utility for LightOnOCR-1B with backend support.

Supports PyTorch and GGUF backends for flexible performance/quality trade-offs.
"""

import os
import sys
import argparse
import time
import traceback
from pathlib import Path
from typing import Optional

from PIL import Image
import pypdfium2 as pdfium

# Add project root to path
sys.path.insert(0, str(Path(__file__).parent))

from backends import create_backend, get_available_backends


def render_pdf_page(page, scale=2.0):
    """Render PDF page to PIL Image with configurable scale."""
    return page.render(scale=scale, rev_byteorder=True).to_pil()


def _load_images(input_path: Path, scale: float) -> list:
    """Return the page images for *input_path* (every PDF page, or one image).

    A ``.pdf`` suffix (case-insensitive) selects PDF rendering through
    ``render_pdf_page``; any other file is handed to ``Image.open``.
    """
    if input_path.suffix.lower() != '.pdf':
        print(f"Processing image: {input_path.name}")
        return [Image.open(input_path)]

    print(f"\nProcessing PDF: {input_path.name}")
    images = []
    pdf = pdfium.PdfDocument(str(input_path))
    try:
        num_pages = len(pdf)
        print(f" Total pages: {num_pages}")
        print(f" Rendering scale: {scale}x")

        for i in range(num_pages):
            # flush so the progress text is visible while the page renders,
            # not only once the trailing newline is printed.
            print(f" Rendering page {i+1}/{num_pages}...", end=" ", flush=True)
            start = time.time()
            images.append(render_pdf_page(pdf[i], scale=scale))
            print(f"({time.time() - start:.1f}s)")
    finally:
        # Close the document even when rendering a page raises.
        pdf.close()
    return images


def process_file(input_path: str, backend_name: str = "pytorch", scale: float = 2.0,
                 temperature: float = 0.1, max_tokens: int = 1024) -> Optional[Path]:
    """
    Process PDF or image file with OCR.

    The recognized text of all pages is joined with blank lines and written
    next to the input file with a ``.md`` suffix.

    Args:
        input_path: Path to input file
        backend_name: "pytorch" or "gguf"
        scale: PDF rendering scale (lower = faster, higher = better quality)
        temperature: Sampling temperature for generation
        max_tokens: Maximum tokens to generate (lower = faster)

    Returns:
        Path of the written Markdown file, or None when the input file
        does not exist (an error message is printed to stderr in that case).
    """
    input_path = Path(input_path).resolve()
    if not input_path.exists():
        print(f"Error: File {input_path} not found.", file=sys.stderr)
        return None

    # Create backend
    print(f"Initializing {backend_name} backend...")
    backend = create_backend(backend_name)
    backend.load_model()

    info = backend.get_backend_info()
    print(f"Backend info: {info}")

    # Load images
    images = _load_images(input_path, scale)

    # Process with OCR
    all_texts = []
    total_start = time.time()

    for i, img in enumerate(images):
        print(f"\n OCR on page {i+1}/{len(images)}...", end=" ", flush=True)
        start = time.time()

        try:
            text = backend.process_image(img, temperature=temperature, max_tokens=max_tokens)
            elapsed = time.time() - start
            all_texts.append(text)
            print(f"({elapsed:.1f}s, {len(text)} chars)")
            print(f" Preview: {text[:80]}...")
        except Exception as e:
            # Best effort: a failing page is recorded in the output as a
            # placeholder so the remaining pages are still processed.
            print(f"ERROR: {e}")
            all_texts.append(f"[Error processing page {i+1}: {e}]")

    # Save results
    final_output = "\n\n".join(all_texts)
    output_path = input_path.with_suffix('.md')
    output_path.write_text(final_output, encoding='utf-8')

    total_time = time.time() - total_start
    # A PDF can contain zero pages; avoid dividing by zero in the summary.
    per_page = total_time / len(images) if images else 0.0
    print("\n✓ OCR Complete!")
    print(f" Total time: {total_time:.1f}s ({per_page:.1f}s per page)")
    print(f" Output: {output_path}")
    return output_path


def main():
    """Parse command-line arguments and run OCR on the requested file.

    Exits with status 1 when the input file is missing or processing
    raises an exception.
    """
    parser = argparse.ArgumentParser(
        description="OCR utility for LightOnOCR-1B with backend selection",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Process with PyTorch (default, best quality)
  python ocr_cli.py document.pdf

  # Process with GGUF (faster, requires llama-cpp-python)
  python ocr_cli.py document.pdf --backend gguf

  # Fast processing with lower resolution
  python ocr_cli.py document.pdf --scale 1.5

  # High quality with higher resolution
  python ocr_cli.py document.pdf --scale 3.0
        """
    )

    parser.add_argument(
        "input_file",
        nargs="?",
        default="test_docs/Xerox Scan_11062025151244_unident.pdf",
        help="Input PDF or image file (default: test PDF)"
    )
    parser.add_argument(
        "--backend",
        choices=get_available_backends(),
        default="pytorch",
        help="Backend to use for inference (default: pytorch)"
    )
    parser.add_argument(
        "--scale",
        type=float,
        default=2.0,
        help="PDF rendering scale (default: 2.0, range: 1.0-4.0)"
    )
    parser.add_argument(
        "--temperature",
        type=float,
        default=0.1,
        help="Sampling temperature (default: 0.1, 0=greedy)"
    )
    parser.add_argument(
        "--max-tokens",
        type=int,
        default=1024,
        help="Maximum tokens to generate (default: 1024, range: 256-2048)"
    )

    args = parser.parse_args()

    # Validate scale (advisory only: out-of-range values are still used)
    if not 1.0 <= args.scale <= 4.0:
        print("Warning: Scale should be between 1.0 and 4.0")

    try:
        output_path = process_file(
            args.input_file,
            backend_name=args.backend,
            scale=args.scale,
            temperature=args.temperature,
            max_tokens=args.max_tokens
        )
    except Exception as e:
        print(f"\nFatal error: {e}", file=sys.stderr)
        traceback.print_exc()
        sys.exit(1)

    if output_path is None:
        # process_file already reported the missing input file.
        sys.exit(1)


if __name__ == "__main__":
    main()