Spaces:

milwright
/

historical-ocr

Running

App Files Files Community

historical-ocr / simple_test.py

milwright

Upload historical-ocr v1.1

e99f9b5 verified 11 months ago

raw

history blame

2.23 kB

	#!/usr/bin/env python3
	"""
	Simple test script for structured_ocr.py
	"""

	import os
	import sys
	import json
	from pathlib import Path

	def main():
	    """Run one end-to-end OCR smoke test against ``input/recipe.jpg``.

	    Steps, all relative to the current working directory:

	    1. Verify that ``input/recipe.jpg`` exists; if not, print an error and
	       return before anything else is touched.
	    2. Ensure the ``output`` directory exists.
	    3. Import ``StructuredOCR`` from ``structured_ocr``, instantiate it, and
	       call ``process_file(image_path, file_type="image")``.
	    4. Dump the result as JSON to ``output/recipe_test.json``.
	    5. Re-read that file and print its size plus a short preview
	       (``file_name``, ``topics``, ``languages``, ``ocr_contents`` keys).

	    Any exception raised while processing, saving or previewing is caught
	    and printed as ``Error processing image: ...``. Failures of the
	    ``structured_ocr`` import or of the ``StructuredOCR()`` constructor
	    happen outside that handler and propagate to the caller.

	    Returns:
	        None in every case; progress and errors are reported on stdout.
	    """
	    print("Testing OCR with a sample image file")

	    # Path to the sample image file
	    image_path = os.path.join("input", "recipe.jpg")

	    # Bail out early so a missing fixture is reported clearly and no
	    # output directory is created for a run that cannot proceed.
	    if not os.path.isfile(image_path):
	        print(f"Error: Image file not found at {image_path}")
	        return

	    print(f"File found: {image_path}")

	    # Create the output directory if it doesn't exist
	    output_dir = "output"
	    os.makedirs(output_dir, exist_ok=True)

	    output_path = os.path.join(output_dir, "recipe_test.json")

	    # Imported here rather than at module level so the fixture check above
	    # runs before the OCR module is loaded.
	    from structured_ocr import StructuredOCR

	    # Initialize OCR processor
	    processor = StructuredOCR()

	    try:
	        # Process the image file
	        print(f"Processing image file: {image_path}")
	        result = processor.process_file(image_path, file_type="image")

	        # Save the result to the output file. The encoding is pinned so the
	        # write/read round-trip below does not depend on the locale default.
	        with open(output_path, 'w', encoding="utf-8") as f:
	            json.dump(result, f, indent=2)

	        print(f"Image processing completed successfully. Output saved to {output_path}")

	        # Check if the output file exists
	        if os.path.isfile(output_path):
	            print(f"Output file exists at {output_path}")
	            # Print the file size
	            file_size = os.path.getsize(output_path)
	            print(f"Output file size: {file_size} bytes")

	            # Print a preview of the output file, re-read from disk so the
	            # preview reflects what was actually written.
	            print("\nPreview of output file:")
	            with open(output_path, 'r', encoding="utf-8") as f:
	                data = json.load(f)
	            # NOTE(review): str.join assumes 'topics' and 'languages' are
	            # lists of strings - confirm against structured_ocr's output.
	            print(f"File name: {data.get('file_name', '')}")
	            print(f"Topics: {', '.join(data.get('topics', []))}")
	            print(f"Languages: {', '.join(data.get('languages', []))}")
	            print("OCR contents keys:", list(data.get('ocr_contents', {}).keys()))
	        else:
	            print(f"Error: Output file not found at {output_path}")

	    except Exception as e:
	        # Top-level boundary of a smoke-test script: report the failure
	        # instead of dumping a traceback.
	        print(f"Error processing image: {e}")

	# Run the smoke test only when this file is executed as a script, not when
	# it is imported.
	if __name__ == "__main__":
	    main()