| import os |
| import h5py |
| import numpy as np |
|
|
# Default dataset path analyzed when this module is run as a script
# (passed to analyze_hdf5_dataset in the __main__ guard).
DATA_FILE = "data/book_dataset.h5"
|
|
|
|
def get_file_size(path):
    """Return the on-disk size of a single file as a human-readable string.

    The byte count is scaled with binary (1024-based) units and formatted
    with two decimals: ``GB`` at or above 1024**3 bytes, ``MB`` at or above
    1024**2 bytes, and ``KB`` for anything smaller (a 512-byte file is
    reported as ``"0.50 KB"``).

    Args:
        path: Path of the file to measure.

    Returns:
        A string such as ``"1.50 MB"``.

    Raises:
        OSError: Propagated from ``os.path.getsize`` when the file does not
            exist or cannot be accessed.
    """
    size_bytes = os.path.getsize(path)
    # Thresholds are inclusive so a file of exactly 1 MiB / 1 GiB reads as
    # "1.00 MB" / "1.00 GB" instead of "1024.00 KB" / "1024.00 MB".
    for exponent, unit in ((3, "GB"), (2, "MB")):
        threshold = 1024 ** exponent
        if size_bytes >= threshold:
            return f"{size_bytes / threshold:.2f} {unit}"
    return f"{size_bytes / 1024:.2f} KB"
|
|
|
|
def analyze_hdf5_dataset(file_path):
    """Print an inspection report for an HDF5 character dataset.

    The file is expected to contain an 'images' dataset and a 'labels'
    dataset. Each distinct label value is converted with ``chr(int(label))``
    and listed with its sample count, most frequent first, followed by a
    summary (number of unique characters, number of image samples taken from
    the first dimension of 'images', and the file size on disk).

    Args:
        file_path: Path to the HDF5 file to inspect.

    Returns:
        None. All results are written to standard output; a missing file or
        missing datasets produce an error message and an early return.
    """
    # Guard: nothing to analyze if the path does not exist.
    if not os.path.exists(file_path):
        print(f"Error: Dataset file not found at '{file_path}'")
        return

    banner = "=" * 50
    print(banner)
    print(" HDF5 Dataset Inspection Report")
    print(banner)
    print(f"Analyzing file: '{file_path}'\n")

    with h5py.File(file_path, 'r') as h5_file:
        # Guard: both datasets must be present before reading anything.
        if not ('labels' in h5_file and 'images' in h5_file):
            print("Error: HDF5 file is missing 'images' or 'labels' datasets.")
            return

        # Read every label into memory; only the shape of 'images' is needed.
        label_values = h5_file['labels'][:]
        sample_total = h5_file['images'].shape[0]

        distinct_values, occurrences = np.unique(label_values, return_counts=True)

        # Map each label's character to the number of samples carrying it.
        frequency = {}
        for code, tally in zip(distinct_values, occurrences):
            frequency[chr(int(code))] = tally

        print("--- Character Frequency ---")
        # Stable descending sort by count, so ties keep np.unique's order.
        ranked = list(frequency.items())
        ranked.sort(key=lambda pair: pair[1], reverse=True)

        for symbol, tally in ranked:
            print(f"Character: '{symbol}' | Samples: {tally}")

        divider = "-" * 27
        print("\n" + divider)

        print("\n--- Summary ---")
        print(f"Total number of unique characters: {len(frequency)}")
        print(f"Total number of image samples: {sample_total}")
        print(f"Total dataset size on disk: {get_file_size(file_path)}")
        print(banner)
|
|
|
|
if __name__ == "__main__":
    import sys

    # Script entry point: analyze the dataset given as the first command-line
    # argument, falling back to DATA_FILE so a bare invocation is unchanged.
    analyze_hdf5_dataset(sys.argv[1] if len(sys.argv) > 1 else DATA_FILE)