Spaces:
Running
Running
| #!/usr/bin/env python3 | |
| """ | |
| Test script for pdf_ocr.py | |
| """ | |
| from pdf_ocr import PDFOCR | |
| import json | |
| import os | |
def _format_preview_value(value, limit=100):
    """Return *value* ready for display, truncating long strings.

    Strings longer than *limit* characters are cut to their first *limit*
    characters and suffixed with "..."; any other value is returned unchanged.
    """
    if isinstance(value, str) and len(value) > limit:
        return f"{value[:limit]}..."
    return value


def main():
    """Run PDFOCR on the sample rubric PDF and print a preview of the result.

    Calls ``PDFOCR.save_json_output`` with the hard-coded input and output
    paths, then reloads the JSON file at the output path and prints its
    ``file_name``, ``topics`` and ``languages`` fields followed by the first
    three entries of ``ocr_contents``.

    Raises:
        OSError: if the output file cannot be opened after processing.
        json.JSONDecodeError: if the output file does not contain valid JSON.
    """
    # Initialize PDF processor
    processor = PDFOCR()

    # Define input and output paths
    pdf_path = "input/rubric.pdf"
    output_path = "output/rubric_test.json"

    # Create output directory if it doesn't exist. The guard keeps this
    # working if output_path is ever changed to a bare filename, because
    # os.makedirs("") raises FileNotFoundError.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Process PDF and save output
    print(f"Processing PDF: {pdf_path}")
    processor.save_json_output(pdf_path, output_path)
    print(f"Output saved to: {output_path}")

    # Read the output back. JSON is UTF-8 by specification, so decode it
    # explicitly instead of relying on the platform's locale encoding.
    with open(output_path, 'r', encoding="utf-8") as f:
        result = json.load(f)

    print("\nOutput preview:")
    print(f"File name: {result.get('file_name')}")
    print(f"Topics: {result.get('topics')}")
    print(f"Languages: {result.get('languages')}")
    print("OCR contents preview (first few keys):")

    # `or {}` also covers an explicit JSON null, which .get()'s default
    # alone would let through as None.
    ocr_contents = result.get('ocr_contents') or {}
    for i, (key, value) in enumerate(ocr_contents.items()):
        if i >= 3:  # Only show first 3 keys
            break
        print(f" {key}: {_format_preview_value(value)}")
# Run the test only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()