| #!/usr/bin/env python3 | |
| """ | |
| OCR CLI utility for LightOnOCR-1B with backend support. | |
| Supports PyTorch and GGUF backends for flexible performance/quality trade-offs. | |
| """ | |
| import os | |
| import sys | |
| import argparse | |
| import time | |
| from pathlib import Path | |
| from PIL import Image | |
| import pypdfium2 as pdfium | |
| # Add project root to path | |
| sys.path.insert(0, str(Path(__file__).parent)) | |
| from backends import create_backend, get_available_backends | |
def render_pdf_page(page, scale=2.0):
    """Rasterize a single PDF page into a PIL Image.

    Args:
        page: A pypdfium2 page object.
        scale: Rendering scale factor; higher values produce larger,
            higher-quality bitmaps.

    Returns:
        The rendered page as a PIL Image.
    """
    bitmap = page.render(scale=scale, rev_byteorder=True)
    return bitmap.to_pil()
def process_file(input_path: str, backend_name: str = "pytorch", scale: float = 2.0,
                 temperature: float = 0.1, max_tokens: int = 1024):
    """
    Process a PDF or image file with OCR and write the result next to it.

    Args:
        input_path: Path to input file (PDF, or any PIL-readable image).
        backend_name: "pytorch" or "gguf".
        scale: PDF rendering scale (lower = faster, higher = better quality).
        temperature: Sampling temperature for generation.
        max_tokens: Maximum tokens to generate (lower = faster).

    Side effects:
        Writes a Markdown file next to the input (same stem, ``.md`` suffix)
        and prints progress to stdout.
    """
    input_path = Path(input_path).resolve()
    if not input_path.exists():
        print(f"Error: File {input_path} not found.")
        return

    # Create backend
    print(f"Initializing {backend_name} backend...")
    backend = create_backend(backend_name)
    backend.load_model()
    info = backend.get_backend_info()
    print(f"Backend info: {info}")

    # Load images
    images = []
    if input_path.suffix.lower() == '.pdf':
        print(f"\nProcessing PDF: {input_path.name}")
        pdf = pdfium.PdfDocument(str(input_path))
        try:
            # try/finally so the document handle is released even if a
            # page render raises.
            num_pages = len(pdf)
            print(f"  Total pages: {num_pages}")
            print(f"  Rendering scale: {scale}x")
            for i in range(num_pages):
                print(f"  Rendering page {i+1}/{num_pages}...", end=" ")
                start = time.time()
                images.append(render_pdf_page(pdf[i], scale=scale))
                print(f"({time.time() - start:.1f}s)")
        finally:
            pdf.close()
    else:
        print(f"Processing image: {input_path.name}")
        img = Image.open(input_path)
        # Read the pixel data now so PIL releases the underlying file
        # handle (Image.open is lazy and would otherwise leak it).
        img.load()
        images = [img]

    if not images:
        # e.g. a zero-page PDF; also avoids ZeroDivisionError in the
        # per-page timing below.
        print("No pages to process.")
        return

    # Process with OCR
    all_texts = []
    total_start = time.time()
    for i, img in enumerate(images):
        print(f"\n  OCR on page {i+1}/{len(images)}...", end=" ")
        start = time.time()
        try:
            text = backend.process_image(img, temperature=temperature, max_tokens=max_tokens)
            elapsed = time.time() - start
            all_texts.append(text)
            print(f"({elapsed:.1f}s, {len(text)} chars)")
            print(f"  Preview: {text[:80]}...")
        except Exception as e:
            # Best-effort: record the failure in the output rather than
            # aborting the remaining pages.
            print(f"ERROR: {e}")
            all_texts.append(f"[Error processing page {i+1}: {e}]")

    # Save results
    final_output = "\n\n".join(all_texts)
    output_path = input_path.with_suffix('.md')
    output_path.write_text(final_output, encoding='utf-8')

    total_time = time.time() - total_start
    print("\n✓ OCR Complete!")
    print(f"  Total time: {total_time:.1f}s ({total_time/len(images):.1f}s per page)")
    print(f"  Output: {output_path}")
def main():
    """Command-line entry point: parse arguments and run OCR on the input."""
    usage_examples = """
Examples:
  # Process with PyTorch (default, best quality)
  python ocr_cli.py document.pdf

  # Process with GGUF (faster, requires llama-cpp-python)
  python ocr_cli.py document.pdf --backend gguf

  # Fast processing with lower resolution
  python ocr_cli.py document.pdf --scale 1.5

  # High quality with higher resolution
  python ocr_cli.py document.pdf --scale 3.0
"""
    cli = argparse.ArgumentParser(
        description="OCR utility for LightOnOCR-1B with backend selection",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )
    cli.add_argument(
        "input_file",
        nargs="?",
        default="test_docs/Xerox Scan_11062025151244_unident.pdf",
        help="Input PDF or image file (default: test PDF)",
    )
    cli.add_argument(
        "--backend",
        choices=get_available_backends(),
        default="pytorch",
        help="Backend to use for inference (default: pytorch)",
    )
    cli.add_argument(
        "--scale",
        type=float,
        default=2.0,
        help="PDF rendering scale (default: 2.0, range: 1.0-4.0)",
    )
    cli.add_argument(
        "--temperature",
        type=float,
        default=0.1,
        help="Sampling temperature (default: 0.1, 0=greedy)",
    )
    cli.add_argument(
        "--max-tokens",
        type=int,
        default=1024,
        help="Maximum tokens to generate (default: 1024, range: 256-2048)",
    )
    args = cli.parse_args()

    # Out-of-range scale is a soft warning, not an error: the backend can
    # still render, just with unusual quality/speed trade-offs.
    if not 1.0 <= args.scale <= 4.0:
        print("Warning: Scale should be between 1.0 and 4.0")

    try:
        process_file(
            args.input_file,
            backend_name=args.backend,
            scale=args.scale,
            temperature=args.temperature,
            max_tokens=args.max_tokens,
        )
    except Exception as e:
        # Top-level boundary: report, dump the traceback, and exit non-zero.
        print(f"\nFatal error: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)
# Run the CLI only when executed as a script (not when imported as a module).
if __name__ == "__main__":
    main()