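# NOTE: the lines below are residue from the repository web viewer this file
# was copied from; they are kept as comments so the module stays valid Python.
#   chiruu12
#   Initial commit of clean OCR application
#   9543569
#   Raw
#   History Blame Contribute Delete
#   1.8 kB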
import os
import h5py
import numpy as np
# Path of the HDF5 dataset that is analyzed when this file is run as a script.
DATA_FILE = "data/book_dataset.h5"
def get_file_size(path):
    """Return the size of the file at *path* as a human-readable string.

    The size is formatted with two decimals in the largest binary unit it
    reaches (GB = 1024**3 bytes, MB = 1024**2 bytes). Anything smaller than
    1 MB, including files under 1 KB, is reported in KB.

    Args:
        path: Path of the file to measure.

    Returns:
        A string such as ``"1.50 MB"``.

    Raises:
        OSError: Propagated from ``os.path.getsize`` when the file does not
            exist or cannot be accessed.
    """
    size_bytes = os.path.getsize(path)
    # Largest unit first. The comparison is inclusive so that a file of
    # exactly 1 MB / 1 GB is shown as "1.00" of that unit instead of
    # "1024.00" of the next smaller one.
    for unit, factor in (("GB", 1024 ** 3), ("MB", 1024 ** 2)):
        if size_bytes >= factor:
            return f"{size_bytes / factor:.2f} {unit}"
    return f"{size_bytes / 1024:.2f} KB"
def analyze_hdf5_dataset(file_path):
    """Print an inspection report for the HDF5 character dataset at *file_path*.

    The report lists every distinct label (rendered as a character via
    ``chr(int(label))``) with its sample count, most frequent first, followed
    by a summary: number of distinct characters, number of image samples
    (first dimension of the ``images`` dataset) and the file size on disk.

    If the file does not exist, or it lacks an ``images`` or ``labels``
    dataset, an error message is printed and the function returns early.

    Args:
        file_path: Path of the HDF5 file to inspect.

    Returns:
        None. All output goes to stdout.
    """
    # Guard: nothing to analyze without the file.
    if not os.path.exists(file_path):
        print(f"Error: Dataset file not found at '{file_path}'")
        return

    banner = "=" * 50
    print(banner)
    print(" HDF5 Dataset Inspection Report")
    print(banner)
    print(f"Analyzing file: '{file_path}'\n")

    with h5py.File(file_path, 'r') as dataset:
        # Both datasets are required for the report.
        if not ('labels' in dataset and 'images' in dataset):
            print("Error: HDF5 file is missing 'images' or 'labels' datasets.")
            return

        # Labels are read fully into memory; for images only the shape is used.
        label_values = dataset['labels'][:]
        sample_total = dataset['images'].shape[0]

        # Tally how often each distinct label value occurs.
        codes, occurrences = np.unique(label_values, return_counts=True)
        frequency = {}
        for code, n_samples in zip(codes, occurrences):
            frequency[chr(int(code))] = n_samples

        print("--- Character Frequency ---")
        # Stable sort: characters with equal counts keep their original order.
        ranking = list(frequency.items())
        ranking.sort(key=lambda entry: entry[1], reverse=True)
        for symbol, n_samples in ranking:
            print(f"Character: '{symbol}' | Samples: {n_samples}")
        print("\n" + "-" * 27)

        print("\n--- Summary ---")
        print(f"Total number of unique characters: {len(frequency)}")
        print(f"Total number of image samples: {sample_total}")
        print(f"Total dataset size on disk: {get_file_size(file_path)}")
        print(banner)
# Script entry point: analyze the default dataset file when run directly.
if __name__ == "__main__":
    analyze_hdf5_dataset(DATA_FILE)