#!/usr/bin/env python3
"""OCR a scanned PDF using ocrmypdf (Tesseract backend).

Command-line interface:

    --input PATH    PDF to run OCR on (required).
    --output PATH   Where to write the result (optional; defaults to the
                    input path with ``_ocr`` appended to the file stem).

Exactly one JSON object is printed to stdout, with the keys ``success``,
``output`` and ``message``.  The process exits with status 1 when OCR fails.
"""
import argparse
import json
import sys
from pathlib import Path
from typing import Optional


def ocr(input_path: str, output_path: Optional[str] = None) -> str:
    """Run OCR on *input_path* and return the path of the written PDF.

    Args:
        input_path: Path of the PDF to process.
        output_path: Destination path.  When empty or ``None``, the result is
            written next to the input as ``<stem>_ocr<suffix>``.

    Returns:
        The output path that was passed to ocrmypdf, as a string.

    Raises:
        ImportError: If the ``ocrmypdf`` package is not installed.
        RuntimeError: If ``ocrmypdf.ocr`` returns a non-zero exit code.
        Exception: Anything raised by ``ocrmypdf.ocr`` itself propagates
            unchanged.
    """
    # Imported lazily so that a missing ocrmypdf installation surfaces as an
    # exception from this call (which main() reports as JSON) instead of
    # breaking the import of this module.
    import ocrmypdf

    if not output_path:
        source = Path(input_path)
        output_path = str(source.with_stem(source.stem + '_ocr'))

    exit_code = ocrmypdf.ocr(
        input_path,
        output_path,
        language='eng',
        skip_text=True,  # Skip pages that already have text
        deskew=True,
        optimize=1,
        progress_bar=False,
    )
    # ocrmypdf.ocr() hands back an exit code; anything non-zero means the run
    # did not finish cleanly, so do not let the caller report it as a success.
    if exit_code:
        raise RuntimeError(f"ocrmypdf finished with exit code {exit_code!r}")
    return output_path


def main() -> None:
    """Parse the command line, run OCR, and print a JSON status object.

    On success prints ``{"success": true, "output": <path>, ...}``.  On any
    exception prints ``{"success": false, "output": "", "message": <error>}``
    and exits with status 1.
    """
    parser = argparse.ArgumentParser(description='OCR a PDF')
    parser.add_argument('--input', required=True, help='PDF file to OCR')
    parser.add_argument(
        '--output',
        default=None,
        help="output PDF path (default: input name with '_ocr' appended)",
    )
    args = parser.parse_args()

    # Top-level boundary: every failure is turned into a machine-readable
    # JSON report on stdout rather than a traceback.
    try:
        result = ocr(args.input, args.output)
    except Exception as e:
        print(json.dumps({"success": False, "output": "", "message": str(e)}))
        sys.exit(1)
    else:
        print(json.dumps({"success": True, "output": result, "message": "OCR completed successfully"}))


if __name__ == '__main__':
    main()