# Spaces: milwright/historical-ocr (Running)
# File size: 1,258 Bytes
# e99f9b5

#!/usr/bin/env python3
"""
Test script for pdf_ocr.py
"""

from pdf_ocr import PDFOCR
import json
import os

def main(pdf_path: str = "input/rubric.pdf",
         output_path: str = "output/rubric_test.json") -> None:
    """Process one PDF with PDFOCR, then print a short preview of the result.

    The processor is asked to write its output to ``output_path``; that file
    is then read back as JSON and a few fields are printed to stdout.

    Args:
        pdf_path: Path of the PDF handed to ``PDFOCR.save_json_output``.
        output_path: Path the output is written to and read back from. Its
            parent directory is created when it does not exist yet.
    """
    # Initialize PDF processor
    processor = PDFOCR()

    # Create the output directory if it doesn't exist. A bare file name has no
    # directory component and os.makedirs("") raises, so only create a
    # directory when there is one to create.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Process PDF and save output
    print(f"Processing PDF: {pdf_path}")
    processor.save_json_output(pdf_path, output_path)
    print(f"Output saved to: {output_path}")

    # Read the output back. Decode explicitly as UTF-8 (the interchange
    # encoding for JSON) instead of relying on the platform's locale default,
    # which would mis-decode non-ASCII OCR text on non-UTF-8 systems.
    with open(output_path, 'r', encoding='utf-8') as f:
        result = json.load(f)

    print("\nOutput preview:")
    print(f"File name: {result.get('file_name')}")
    print(f"Topics: {result.get('topics')}")
    print(f"Languages: {result.get('languages')}")
    print("OCR contents preview (first few keys):")
    # A JSON null under 'ocr_contents' loads as None; treat it as empty.
    # NOTE(review): assumes 'ocr_contents' is a JSON object (dict) when
    # present - confirm against pdf_ocr.py.
    ocr_contents = result.get('ocr_contents') or {}
    for i, (key, value) in enumerate(ocr_contents.items()):
        if i >= 3:  # Only show first 3 keys
            break
        # Long strings are cut to 100 characters; everything else is printed
        # in full.
        if isinstance(value, str) and len(value) > 100:
            print(f"  {key}: {value[:100]}...")
        else:
            print(f"  {key}: {value}")

# Run the test only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()