Spaces:
Sleeping
Sleeping
File size: 2,094 Bytes
d576da9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 |
# check_data_balance.py
import os
from pathlib import Path
def check_dataset_balance(data_path: Path):
    """
    Check and print the balance of classes in a dataset directory.

    The expected directory structure is:
        - data_path/
            - class_A/
                - image1.jpg
                - image2.jpg
                ...
            - class_B/
                - image1.jpg
                - image2.jpg
                ...

    Every non-hidden subdirectory of ``data_path`` is treated as one class,
    and every non-hidden regular file directly inside it is counted as one
    image. Hidden entries (names starting with ".", such as
    ``.ipynb_checkpoints`` or ``.DS_Store``) are skipped so that tooling
    artifacts neither appear as classes nor inflate the counts. Files are
    not filtered by extension.

    Args:
        data_path (Path): The path to the main dataset directory. A plain
            string path is accepted as well.

    Returns:
        None. The report (or an error message) is printed to stdout.
    """
    # Accept str as well as Path so callers need not wrap the argument.
    data_path = Path(data_path)
    print(f"--- Checking Dataset Balance at: {data_path} ---\n")

    if not data_path.is_dir():
        print("❌ ERROR: The provided path is not a valid directory.")
        return

    # Sort so the report is identical from run to run; iterdir() yields
    # entries in arbitrary, filesystem-dependent order.
    class_names = sorted(
        entry.name
        for entry in data_path.iterdir()
        if entry.is_dir() and not entry.name.startswith(".")
    )
    if not class_names:
        print("❌ ERROR: No class subdirectories found in the dataset folder.")
        return

    print(f"Found {len(class_names)} classes: {', '.join(class_names)}\n")

    class_counts = {}
    for class_name in class_names:
        # Count regular files only: nested directories are ignored, and so
        # are hidden files, which are OS/editor metadata rather than images.
        class_counts[class_name] = sum(
            1
            for entry in (data_path / class_name).iterdir()
            if entry.is_file() and not entry.name.startswith(".")
        )
    total_images = sum(class_counts.values())

    print("--- Image Counts per Class ---")
    for class_name, count in class_counts.items():
        # Guard against division by zero when every class folder is empty.
        percentage = (count / total_images) * 100 if total_images > 0 else 0
        print(f"- {class_name:<20}: {count:>5} images ({percentage:.2f}%)")
    print("-" * 35)
    print(f"- {'Total':<20}: {total_images:>5} images\n")

    print("--- For your training script ---")
    print("Use these counts to calculate your class_weight dictionary.")
if __name__ == "__main__":
    import sys

    # --- IMPORTANT ---
    # Pass the dataset folder as the first command-line argument, or update
    # the default below to point to your actual dataset folder.
    # This is the folder that contains the 'Normal' and 'adenocarcinoma' subfolders.
    default_directory = Path("artifacts/data_ingestion/Chest-CT-Scan-data")
    dataset_directory = Path(sys.argv[1]) if len(sys.argv) > 1 else default_directory
    check_dataset_balance(dataset_directory)