#!/usr/bin/env python3 """Analyze the Kaggle waste dataset structure for finetuning.""" import kagglehub import os from pathlib import Path from collections import defaultdict import json def analyze_dataset(): print("šŸ”„ Getting dataset path...") # Get dataset path (already downloaded) path = kagglehub.dataset_download("alistairking/recyclable-and-household-waste-classification") dataset_path = Path(path) print(f"šŸ“ Dataset path: {dataset_path}") # Analyze structure category_info = defaultdict(lambda: {"default": 0, "real_world": 0, "total": 0}) print("\nšŸ“Š Analyzing dataset structure...") # Navigate to images folder images_root = dataset_path / "images" / "images" if not images_root.exists(): print(f"āŒ Images folder not found at {images_root}") return # Count images per category and variant for category_dir in images_root.iterdir(): if category_dir.is_dir(): category_name = category_dir.name for variant_dir in category_dir.iterdir(): if variant_dir.is_dir(): variant_name = variant_dir.name image_count = len(list(variant_dir.glob("*.png"))) category_info[category_name][variant_name] = image_count category_info[category_name]["total"] += image_count # Print summary print(f"\nšŸ“‹ Dataset Summary:") print(f"{'Category':<30} {'Default':<10} {'Real-World':<12} {'Total':<8}") print("-" * 70) total_images = 0 for category, info in category_info.items(): default_count = info.get("default", 0) real_world_count = info.get("real_world", 0) total_count = info["total"] total_images += total_count print(f"{category:<30} {default_count:<10} {real_world_count:<12} {total_count:<8}") print("-" * 70) print(f"{'TOTAL':<30} {'':<10} {'':<12} {total_images:<8}") # Save dataset info for finetuning dataset_info = { "dataset_path": str(dataset_path), "images_root": str(images_root), "categories": dict(category_info), "total_images": total_images, "num_categories": len(category_info) } with open("dataset_info.json", "w") as f: json.dump(dataset_info, f, indent=2) print(f"\nšŸ’¾ Dataset info saved to dataset_info.json") print(f"šŸŽÆ Found {len(category_info)} categories with {total_images} total images") return dataset_info if __name__ == "__main__": analyze_dataset()