# LightOnOCR-1B-Demo / ocr_cli.py
# DocUA's picture
# feat: update ggml kernels, webui components, model templates, and build configurations
# eb133b8
#!/usr/bin/env python3
"""
OCR CLI utility for LightOnOCR-1B with backend support.
Supports PyTorch and GGUF backends for flexible performance/quality trade-offs.
"""
import os
import sys
import argparse
import time
from pathlib import Path
from PIL import Image
import pypdfium2 as pdfium
# Add project root to path
sys.path.insert(0, str(Path(__file__).parent))
from backends import create_backend, get_available_backends
def render_pdf_page(page, scale=2.0):
    """Rasterize a single pypdfium2 page into a PIL Image.

    Args:
        page: A pypdfium2 page object (obtained by indexing a PdfDocument).
        scale: Zoom factor for rendering; higher values produce larger,
            sharper bitmaps at the cost of speed and memory.

    Returns:
        A PIL.Image with the rendered page content.
    """
    bitmap = page.render(scale=scale, rev_byteorder=True)
    return bitmap.to_pil()
def _load_images(input_path: Path, scale: float) -> list:
    """Load the input file as a list of PIL Images (one per PDF page)."""
    images = []
    if input_path.suffix.lower() == '.pdf':
        print(f"\nProcessing PDF: {input_path.name}")
        pdf = pdfium.PdfDocument(str(input_path))
        try:
            num_pages = len(pdf)
            print(f" Total pages: {num_pages}")
            print(f" Rendering scale: {scale}x")
            for i in range(num_pages):
                print(f" Rendering page {i+1}/{num_pages}...", end=" ")
                start = time.time()
                images.append(render_pdf_page(pdf[i], scale=scale))
                print(f"({time.time() - start:.1f}s)")
        finally:
            # Release the document even if a page fails to render.
            pdf.close()
    else:
        print(f"Processing image: {input_path.name}")
        img = Image.open(input_path)
        # Image.open is lazy and keeps the file handle open; force the
        # decode now so the handle can be released.
        img.load()
        images = [img]
    return images


def process_file(input_path: str, backend_name: str = "pytorch", scale: float = 2.0,
                 temperature: float = 0.1, max_tokens: int = 1024):
    """
    Process a PDF or image file with OCR and write the result as Markdown
    next to the input (same path, `.md` suffix).

    Args:
        input_path: Path to the input PDF or image file.
        backend_name: "pytorch" or "gguf".
        scale: PDF rendering scale (lower = faster, higher = better quality).
        temperature: Sampling temperature for generation.
        max_tokens: Maximum tokens to generate (lower = faster).
    """
    input_path = Path(input_path).resolve()
    if not input_path.exists():
        print(f"Error: File {input_path} not found.")
        return

    # Create backend
    print(f"Initializing {backend_name} backend...")
    backend = create_backend(backend_name)
    backend.load_model()
    info = backend.get_backend_info()
    print(f"Backend info: {info}")

    # Load images
    images = _load_images(input_path, scale)
    if not images:
        # Guard: a zero-page PDF would otherwise divide by zero in the
        # per-page timing summary below.
        print("No pages found; nothing to do.")
        return

    # Process with OCR
    all_texts = []
    total_start = time.time()
    for i, img in enumerate(images):
        print(f"\n OCR on page {i+1}/{len(images)}...", end=" ")
        start = time.time()
        try:
            text = backend.process_image(img, temperature=temperature, max_tokens=max_tokens)
            elapsed = time.time() - start
            all_texts.append(text)
            print(f"({elapsed:.1f}s, {len(text)} chars)")
            print(f" Preview: {text[:80]}...")
        except Exception as e:
            # Best effort: record the failure inline and continue with the
            # remaining pages rather than aborting the whole document.
            print(f"ERROR: {e}")
            all_texts.append(f"[Error processing page {i+1}: {e}]")

    # Save results
    final_output = "\n\n".join(all_texts)
    output_path = input_path.with_suffix('.md')
    output_path.write_text(final_output, encoding='utf-8')
    total_time = time.time() - total_start
    print(f"\n✓ OCR Complete!")
    print(f" Total time: {total_time:.1f}s ({total_time/len(images):.1f}s per page)")
    print(f" Output: {output_path}")
def _build_parser() -> argparse.ArgumentParser:
    """Construct the CLI argument parser for the OCR utility."""
    parser = argparse.ArgumentParser(
        description="OCR utility for LightOnOCR-1B with backend selection",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Process with PyTorch (default, best quality)
python ocr_cli.py document.pdf
# Process with GGUF (faster, requires llama-cpp-python)
python ocr_cli.py document.pdf --backend gguf
# Fast processing with lower resolution
python ocr_cli.py document.pdf --scale 1.5
# High quality with higher resolution
python ocr_cli.py document.pdf --scale 3.0
"""
    )
    parser.add_argument(
        "input_file",
        nargs="?",
        default="test_docs/Xerox Scan_11062025151244_unident.pdf",
        help="Input PDF or image file (default: test PDF)",
    )
    parser.add_argument(
        "--backend",
        choices=get_available_backends(),
        default="pytorch",
        help="Backend to use for inference (default: pytorch)",
    )
    parser.add_argument(
        "--scale",
        type=float,
        default=2.0,
        help="PDF rendering scale (default: 2.0, range: 1.0-4.0)",
    )
    parser.add_argument(
        "--temperature",
        type=float,
        default=0.1,
        help="Sampling temperature (default: 0.1, 0=greedy)",
    )
    parser.add_argument(
        "--max-tokens",
        type=int,
        default=1024,
        help="Maximum tokens to generate (default: 1024, range: 256-2048)",
    )
    return parser


def main():
    """CLI entry point: parse arguments, validate, and run OCR on the input."""
    args = _build_parser().parse_args()

    # Out-of-range scale is allowed, but warn the user about it.
    if not 1.0 <= args.scale <= 4.0:
        print("Warning: Scale should be between 1.0 and 4.0")

    try:
        process_file(
            args.input_file,
            backend_name=args.backend,
            scale=args.scale,
            temperature=args.temperature,
            max_tokens=args.max_tokens,
        )
    except Exception as e:
        print(f"\nFatal error: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)
if __name__ == "__main__":
main()