Spaces:
Sleeping
Sleeping
File size: 1,125 Bytes
#!/usr/bin/env python3
"""OCR a scanned PDF using ocrmypdf (Tesseract backend)."""
import argparse, json, sys
from pathlib import Path
def ocr(input_path, output_path=None, language='eng'):
    """Run OCR on a PDF with ocrmypdf and return the output file path.

    Args:
        input_path: Path of the PDF to process.
        output_path: Destination path. When falsy (``None`` or ``''``) the
            result is written beside the input as ``<stem>_ocr<suffix>``.
        language: Language value forwarded to ``ocrmypdf.ocr``; defaults
            to ``'eng'``.

    Returns:
        The output path: ``output_path`` as given, or the derived default
        as a string.

    Raises:
        Any exception raised by importing or running ocrmypdf; nothing is
        caught here.
    """
    # Imported at call time rather than at module load, so a missing
    # ocrmypdf installation surfaces as an exception from this call.
    import ocrmypdf

    if not output_path:
        source = Path(input_path)
        # with_name() instead of with_stem() so this also works on
        # Python versions older than 3.9; the resulting path is the same.
        output_path = str(source.with_name(source.stem + '_ocr' + source.suffix))

    ocrmypdf.ocr(
        input_path,
        output_path,
        language=language,
        skip_text=True,  # Skip pages that already have text
        deskew=True,
        optimize=1,
        progress_bar=False,
    )
    return output_path
def main():
    """Command-line entry point: OCR one PDF and report the outcome as JSON.

    Parses ``--input`` (required) and ``--output`` (optional), calls
    ``ocr`` and prints a single JSON object with the keys ``success``,
    ``output`` and ``message`` to stdout. On any exception the failure
    object is printed and the process exits with status 1.
    """
    parser = argparse.ArgumentParser(description='OCR a PDF')
    parser.add_argument('--input', required=True)
    # Optional: when omitted, ocr() derives "<stem>_ocr<suffix>" beside
    # the input file, a fallback that was unreachable while this flag
    # was mandatory.
    parser.add_argument('--output', default=None)
    args = parser.parse_args()

    try:
        result = ocr(args.input, args.output)
        print(json.dumps({"success": True, "output": result, "message": "OCR completed successfully"}))
    except Exception as e:
        # Top-level boundary: report every failure in the same JSON shape
        # as success, then signal the error through the exit status.
        print(json.dumps({"success": False, "output": "", "message": str(e)}))
        sys.exit(1)


if __name__ == '__main__':
    main()
|