Spaces:

chiruu12
/

Digital-Image-Processing-OCR

Sleeping

Digital-Image-Processing-OCR / src /Cutom_dataset_scripts /inspect_dataset.py

chiruu12

Initial commit of clean OCR application

9543569 7 months ago

1.8 kB

	import os
	import h5py
	import numpy as np

	# Path of the HDF5 dataset that is analyzed when this script is run directly.
	DATA_FILE = "data/book_dataset.h5"


	def get_file_size(path):
	    """Return the size of the file at ``path`` as a human-readable string.

	    The byte count is read with ``os.path.getsize`` and formatted with two
	    decimals using binary units (1 KB = 1024 bytes): "GB" from 1024**3 bytes
	    upward, "MB" from 1024**2 bytes upward, and "KB" for anything smaller
	    (files under 1024 bytes are reported as a fraction of a KB).

	    Args:
	        path: Path of the file to measure.

	    Returns:
	        A string such as ``"1.50 MB"``.

	    Raises:
	        OSError: Propagated from ``os.path.getsize`` when the file does not
	            exist or cannot be accessed.
	    """
	    size_bytes = os.path.getsize(path)
	    # Thresholds are inclusive so that exactly 1024**2 bytes reads "1.00 MB"
	    # rather than "1024.00 KB" (and likewise at the GB boundary).
	    if size_bytes >= 1024 ** 3:
	        return f"{size_bytes / (1024 ** 3):.2f} GB"
	    if size_bytes >= 1024 ** 2:
	        return f"{size_bytes / (1024 ** 2):.2f} MB"
	    return f"{size_bytes / 1024:.2f} KB"


	def analyze_hdf5_dataset(file_path):
	    """Print an inspection report for an HDF5 character dataset.

	    The file must contain an 'images' and a 'labels' dataset. The report
	    lists the number of samples per character (most frequent first), then a
	    summary with the number of distinct characters, the number of image
	    samples (first dimension of 'images'), and the file size on disk.

	    Args:
	        file_path: Path to the HDF5 file to inspect.

	    Returns:
	        None. Everything is written to stdout. A missing file or a file
	        without the required datasets results in an error message and an
	        early return rather than an exception.
	    """
	    if not os.path.exists(file_path):
	        print(f"Error: Dataset file not found at '{file_path}'")
	        return

	    print("=" * 50)
	    print(" HDF5 Dataset Inspection Report")
	    print("=" * 50)
	    print(f"Analyzing file: '{file_path}'\n")

	    with h5py.File(file_path, 'r') as hf:
	        if 'labels' not in hf or 'images' not in hf:
	            print("Error: HDF5 file is missing 'images' or 'labels' datasets.")
	            return

	        # Both values are copied into memory here, so the file can be closed
	        # before the report is built.
	        labels = hf['labels'][:]
	        total_images = hf['images'].shape[0]

	    unique_labels, counts = np.unique(labels, return_counts=True)
	    # Each label is converted with chr(int(...)), i.e. it is interpreted as
	    # a Unicode code point.
	    char_counts = {chr(int(label)): count for label, count in zip(unique_labels, counts)}

	    print("--- Character Frequency ---")
	    # sorted() is stable, so characters with equal counts keep the ascending
	    # label order produced by np.unique.
	    sorted_chars = sorted(char_counts.items(), key=lambda item: item[1], reverse=True)

	    for char, count in sorted_chars:
	        # A plain "|" separator: the previous "\|" was an invalid escape
	        # sequence and printed a stray backslash.
	        print(f"Character: '{char}' | Samples: {count}")

	    print("\n" + "-" * 27)

	    print("\n--- Summary ---")
	    print(f"Total number of unique characters: {len(char_counts)}")
	    print(f"Total number of image samples: {total_images}")
	    print(f"Total dataset size on disk: {get_file_size(file_path)}")
	    print("=" * 50)


	# Script entry point: inspect the default dataset only when this file is
	# executed directly, not when it is imported.
	if __name__ == "__main__":
	    analyze_hdf5_dataset(DATA_FILE)