# Source: chest-cancer-classifier-app / class_check.py
# Commit d576da9 (ALYYAN): "Initial commit of clean, working project" -- 2.09 kB
# check_data_balance.py
import os
from pathlib import Path
def check_dataset_balance(data_path: Path):
    """
    Check and print the balance of classes in a dataset directory.

    The expected directory structure is:
    - data_path/
        - class_A/
            - image1.jpg
            - image2.jpg
            ...
        - class_B/
            - image1.jpg
            - image2.jpg
            ...

    Every immediate subdirectory of ``data_path`` is treated as one class,
    and every regular file directly inside it is counted as one image.
    Hidden entries (names starting with "."), such as ``.ipynb_checkpoints``
    folders or ``.DS_Store`` files, are skipped at both levels so tooling
    artefacts neither show up as classes nor inflate the counts.

    Args:
        data_path (Path): The path to the main dataset directory. A plain
            string is accepted as well and converted to a ``Path``.

    Returns:
        dict: Mapping of class name to image count, ordered by class name.
        An empty dict is returned when ``data_path`` is not a directory or
        contains no class subdirectories (an error message is printed).
    """
    data_path = Path(data_path)
    print(f"--- Checking Dataset Balance at: {data_path} ---\n")

    if not data_path.is_dir():
        print("❌ ERROR: The provided path is not a valid directory.")
        return {}

    # iterdir() yields entries in arbitrary order; sort so the report (and
    # the returned mapping) is stable from run to run.
    class_names = sorted(
        entry.name
        for entry in data_path.iterdir()
        if entry.is_dir() and not entry.name.startswith(".")
    )
    if not class_names:
        print("❌ ERROR: No class subdirectories found in the dataset folder.")
        return {}

    print(f"Found {len(class_names)} classes: {', '.join(class_names)}\n")

    # Count only regular, non-hidden files; nested folders are ignored.
    class_counts = {
        class_name: sum(
            1
            for entry in (data_path / class_name).iterdir()
            if entry.is_file() and not entry.name.startswith(".")
        )
        for class_name in class_names
    }
    total_images = sum(class_counts.values())

    print("--- Image Counts per Class ---")
    for class_name, count in class_counts.items():
        # Guard against division by zero when every class folder is empty.
        percentage = (count / total_images) * 100 if total_images > 0 else 0
        print(f"- {class_name:<20}: {count:>5} images ({percentage:.2f}%)")
    print("-" * 35)
    print(f"- {'Total':<20}: {total_images:>5} images\n")

    print("--- For your training script ---")
    print("Use these counts to calculate your class_weight dictionary.")
    return class_counts
if __name__ == "__main__":
    # NOTE: before running, point the path below at your own dataset root,
    # i.e. the folder that directly contains the per-class subfolders such
    # as 'Normal' and 'adenocarcinoma'.
    check_dataset_balance(Path("artifacts/data_ingestion/Chest-CT-Scan-data"))