#!/usr/bin/env python3
"""
Test script for pdf_ocr.py
"""
from pdf_ocr import PDFOCR
import json
import os
def main(pdf_path="input/rubric.pdf", output_path="output/rubric_test.json"):
    """Process one PDF with PDFOCR, save the JSON output, and print a preview.

    Calls ``PDFOCR.save_json_output(pdf_path, output_path)``, then reads the
    written file back and prints the ``file_name``, ``topics`` and
    ``languages`` fields plus the first few ``ocr_contents`` entries.

    Args:
        pdf_path: Path of the PDF file to process.
        output_path: Path of the JSON file to write and then preview.
    """
    max_preview_keys = 3     # number of ocr_contents entries to show
    max_preview_chars = 100  # longer string values are truncated with "..."

    # Initialize PDF processor
    processor = PDFOCR()

    # Create the output directory if it doesn't exist. A bare file name has
    # an empty dirname, and os.makedirs("") raises, so skip that case.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Process PDF and save output
    print(f"Processing PDF: {pdf_path}")
    processor.save_json_output(pdf_path, output_path)
    print(f"Output saved to: {output_path}")

    # Read the output back. An explicit encoding avoids depending on the
    # platform's locale default.
    # NOTE(review): assumes save_json_output writes UTF-8 (the standard JSON
    # encoding) - confirm against pdf_ocr.
    with open(output_path, "r", encoding="utf-8") as f:
        result = json.load(f)

    print("\nOutput preview:")
    print(f"File name: {result.get('file_name')}")
    print(f"Topics: {result.get('topics')}")
    print(f"Languages: {result.get('languages')}")
    print("OCR contents preview (first few keys):")

    # "or {}" also covers an explicit null value, not just a missing key.
    ocr_contents = result.get('ocr_contents') or {}
    for i, (key, value) in enumerate(ocr_contents.items()):
        if i >= max_preview_keys:  # Only show the first few keys
            break
        if isinstance(value, str) and len(value) > max_preview_chars:
            print(f" {key}: {value[:max_preview_chars]}...")
        else:
            print(f" {key}: {value}")
# Run the test only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()