# LightOnOCR-1B-Demo / ocr_cli.py
# DocUA's picture
# feat: update ggml kernels, webui components, model templates, and build configurations
# eb133b8
#!/usr/bin/env python3
"""
OCR CLI utility for LightOnOCR-1B with backend support.
Supports PyTorch and GGUF backends for flexible performance/quality trade-offs.
"""
import os
import sys
import argparse
import time
from pathlib import Path
from PIL import Image
import pypdfium2 as pdfium
# Add project root to path
sys.path.insert(0, str(Path(__file__).parent))
from backends import create_backend, get_available_backends
def render_pdf_page(page, scale=2.0):
    """Rasterize a single pypdfium2 page into a PIL Image.

    Args:
        page: A pypdfium2 page object (obtained by indexing a PdfDocument).
        scale: Zoom factor for rendering; higher values produce larger,
            sharper bitmaps at the cost of speed and memory.

    Returns:
        A PIL.Image with the rendered page content.
    """
    bitmap = page.render(scale=scale, rev_byteorder=True)
    return bitmap.to_pil()
def _load_images(input_path: Path, scale: float) -> list:
    """Load the input file as a list of PIL Images (one per PDF page)."""
    images = []
    if input_path.suffix.lower() == '.pdf':
        print(f"\nProcessing PDF: {input_path.name}")
        pdf = pdfium.PdfDocument(str(input_path))
        try:
            num_pages = len(pdf)
            print(f" Total pages: {num_pages}")
            print(f" Rendering scale: {scale}x")
            for i in range(num_pages):
                print(f" Rendering page {i+1}/{num_pages}...", end=" ")
                start = time.time()
                images.append(render_pdf_page(pdf[i], scale=scale))
                print(f"({time.time() - start:.1f}s)")
        finally:
            # Release the document even if a page fails to render.
            pdf.close()
    else:
        print(f"Processing image: {input_path.name}")
        img = Image.open(input_path)
        # Image.open is lazy and keeps the file handle open; force the
        # decode now so the handle can be released.
        img.load()
        images = [img]
    return images


def process_file(input_path: str, backend_name: str = "pytorch", scale: float = 2.0,
                 temperature: float = 0.1, max_tokens: int = 1024):
    """
    Process a PDF or image file with OCR and write the result as Markdown
    next to the input (same path, `.md` suffix).

    Args:
        input_path: Path to the input PDF or image file.
        backend_name: "pytorch" or "gguf".
        scale: PDF rendering scale (lower = faster, higher = better quality).
        temperature: Sampling temperature for generation.
        max_tokens: Maximum tokens to generate (lower = faster).
    """
    input_path = Path(input_path).resolve()
    if not input_path.exists():
        print(f"Error: File {input_path} not found.")
        return

    # Create backend
    print(f"Initializing {backend_name} backend...")
    backend = create_backend(backend_name)
    backend.load_model()
    info = backend.get_backend_info()
    print(f"Backend info: {info}")

    # Load images
    images = _load_images(input_path, scale)
    if not images:
        # Guard: a zero-page PDF would otherwise divide by zero in the
        # per-page timing summary below.
        print("No pages found; nothing to do.")
        return

    # Process with OCR
    all_texts = []
    total_start = time.time()
    for i, img in enumerate(images):
        print(f"\n OCR on page {i+1}/{len(images)}...", end=" ")
        start = time.time()
        try:
            text = backend.process_image(img, temperature=temperature, max_tokens=max_tokens)
            elapsed = time.time() - start
            all_texts.append(text)
            print(f"({elapsed:.1f}s, {len(text)} chars)")
            print(f" Preview: {text[:80]}...")
        except Exception as e:
            # Best effort: record the failure inline and continue with the
            # remaining pages rather than aborting the whole document.
            print(f"ERROR: {e}")
            all_texts.append(f"[Error processing page {i+1}: {e}]")

    # Save results
    final_output = "\n\n".join(all_texts)
    output_path = input_path.with_suffix('.md')
    output_path.write_text(final_output, encoding='utf-8')
    total_time = time.time() - total_start
    print(f"\n✓ OCR Complete!")
    print(f" Total time: {total_time:.1f}s ({total_time/len(images):.1f}s per page)")
    print(f" Output: {output_path}")
def _build_parser() -> argparse.ArgumentParser:
    """Construct the CLI argument parser for the OCR utility."""
    parser = argparse.ArgumentParser(
        description="OCR utility for LightOnOCR-1B with backend selection",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Process with PyTorch (default, best quality)
python ocr_cli.py document.pdf
# Process with GGUF (faster, requires llama-cpp-python)
python ocr_cli.py document.pdf --backend gguf
# Fast processing with lower resolution
python ocr_cli.py document.pdf --scale 1.5
# High quality with higher resolution
python ocr_cli.py document.pdf --scale 3.0
"""
    )
    parser.add_argument(
        "input_file",
        nargs="?",
        default="test_docs/Xerox Scan_11062025151244_unident.pdf",
        help="Input PDF or image file (default: test PDF)",
    )
    parser.add_argument(
        "--backend",
        choices=get_available_backends(),
        default="pytorch",
        help="Backend to use for inference (default: pytorch)",
    )
    parser.add_argument(
        "--scale",
        type=float,
        default=2.0,
        help="PDF rendering scale (default: 2.0, range: 1.0-4.0)",
    )
    parser.add_argument(
        "--temperature",
        type=float,
        default=0.1,
        help="Sampling temperature (default: 0.1, 0=greedy)",
    )
    parser.add_argument(
        "--max-tokens",
        type=int,
        default=1024,
        help="Maximum tokens to generate (default: 1024, range: 256-2048)",
    )
    return parser


def main():
    """CLI entry point: parse arguments, validate, and run OCR on the input."""
    args = _build_parser().parse_args()

    # Out-of-range scale is allowed, but warn the user about it.
    if not 1.0 <= args.scale <= 4.0:
        print("Warning: Scale should be between 1.0 and 4.0")

    try:
        process_file(
            args.input_file,
            backend_name=args.backend,
            scale=args.scale,
            temperature=args.temperature,
            max_tokens=args.max_tokens,
        )
    except Exception as e:
        print(f"\nFatal error: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)
if __name__ == "__main__":
main()