Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """OCR a scanned PDF using ocrmypdf (Tesseract backend).""" | |
| import argparse, json, sys | |
| from pathlib import Path | |
def ocr(input_path, output_path=None):
    """Run OCR on a PDF with ocrmypdf and return the output path.

    Args:
        input_path: Path of the PDF to process.
        output_path: Destination path. When falsy (``None`` or ``''``), the
            result is written next to the input with ``_ocr`` appended to
            the file stem (``scan.pdf`` -> ``scan_ocr.pdf``).

    Returns:
        The output path that was handed to ocrmypdf.

    Raises:
        RuntimeError: If ``ocrmypdf.ocr`` returns a non-zero exit code.
        Exception: Errors raised by ocrmypdf itself propagate unchanged.
    """
    # Imported at call time, so importing this module does not itself
    # require ocrmypdf to be installed.
    import ocrmypdf

    if not output_path:
        source = Path(input_path)
        # Path.with_stem needs Python 3.9+.
        output_path = str(source.with_stem(source.stem + '_ocr'))

    exit_code = ocrmypdf.ocr(
        input_path,
        output_path,
        language='eng',
        skip_text=True,  # Skip pages that already have text
        deskew=True,
        optimize=1,
        progress_bar=False,
    )

    # ocrmypdf documents the return value as an ExitCode where 0 means OK;
    # some failures (e.g. an invalid output PDF) are reported this way
    # instead of via an exception, so do not silently treat them as success.
    if exit_code:
        status = getattr(exit_code, 'name', exit_code)
        raise RuntimeError(f'ocrmypdf finished with a non-zero exit code: {status}')

    return output_path
def main():
    """Parse command-line arguments, run OCR, and print a JSON result.

    Exactly one JSON object with the keys ``success``, ``output`` and
    ``message`` is printed to stdout. If OCR raises, the failure object is
    printed and the process exits with status 1.
    """
    parser = argparse.ArgumentParser(description='OCR a PDF')
    parser.add_argument('--input', required=True, help='PDF file to OCR')
    # Optional: ocr() derives an output path from the input when none is given.
    parser.add_argument(
        '--output',
        default=None,
        help='where to write the result (default: derived from --input)',
    )
    args = parser.parse_args()

    try:
        result = ocr(args.input, args.output)
    except Exception as e:
        # Top-level boundary: report any failure as JSON instead of a traceback.
        # Some exceptions stringify to '', so fall back to the class name.
        message = str(e) or type(e).__name__
        print(json.dumps({"success": False, "output": "", "message": message}))
        sys.exit(1)

    print(json.dumps({"success": True, "output": result, "message": "OCR completed successfully"}))
# Run the command-line interface only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()