historical-ocr / simple_test.py
milwright's picture
Upload historical-ocr v1.1
e99f9b5 verified
raw
history blame
2.23 kB
#!/usr/bin/env python3
"""
Simple test script for structured_ocr.py
"""
import os
import sys
import json
from pathlib import Path
def _print_preview(output_path):
    """Print the size and a few headline fields of the saved JSON result.

    The file is read back from disk (rather than reusing the in-memory
    result) so the preview reflects what was actually written.

    Args:
        output_path: Path of the JSON file to summarize.
    """
    print(f"Output file exists at {output_path}")

    # Print the file size
    file_size = os.path.getsize(output_path)
    print(f"Output file size: {file_size} bytes")

    # Print a preview of the output file
    print("\nPreview of output file:")
    with open(output_path, 'r', encoding="utf-8") as f:
        data = json.load(f)
    # NOTE(review): assumes the saved result is a JSON object whose 'topics'
    # and 'languages' values are lists of strings -- confirm against
    # StructuredOCR.process_file.
    print(f"File name: {data.get('file_name', '')}")
    print(f"Topics: {', '.join(data.get('topics', []))}")
    print(f"Languages: {', '.join(data.get('languages', []))}")
    print("OCR contents keys:", list(data.get('ocr_contents', {}).keys()))


def main():
    """Run StructuredOCR on the sample recipe image and save the result as JSON.

    Reads ``input/recipe.jpg``, writes ``output/recipe_test.json`` and prints
    a short preview of the saved file.

    Returns:
        int: 0 when the image was processed, saved and previewed; 1 when the
        input image is missing, processing raised, or the output file was not
        found afterwards. The value is used as the process exit status.
    """
    print("Testing OCR with a sample image file")

    # Path to the sample image file
    image_path = os.path.join("input", "recipe.jpg")

    # Check if the file exists
    if not os.path.isfile(image_path):
        print(f"Error: Image file not found at {image_path}")
        return 1
    print(f"File found: {image_path}")

    # Create the output directory if it doesn't exist
    output_dir = "output"
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, "recipe_test.json")

    # Imported after the input check, so a missing image is reported without
    # importing structured_ocr at all.
    from structured_ocr import StructuredOCR

    # Initialize OCR processor
    processor = StructuredOCR()

    try:
        # Process the image file
        print(f"Processing image file: {image_path}")
        result = processor.process_file(image_path, file_type="image")

        # Save the result to the output file (explicit encoding so the file
        # does not depend on the platform's locale default)
        with open(output_path, 'w', encoding="utf-8") as f:
            json.dump(result, f, indent=2)
        print(f"Image processing completed successfully. Output saved to {output_path}")

        # Check if the output file exists
        if not os.path.isfile(output_path):
            print(f"Error: Output file not found at {output_path}")
            return 1
        _print_preview(output_path)
    except Exception as e:
        # Top-level boundary of a test script: report the failure and signal
        # it through the exit status instead of a traceback.
        print(f"Error processing image: {e}")
        return 1
    return 0


if __name__ == "__main__":
    # Propagate success/failure to the shell so the script is usable in CI.
    sys.exit(main())