Spaces:
Runtime error
Runtime error
| #!/usr/bin/env python3 | |
| """Analyze the Kaggle waste dataset structure for finetuning.""" | |
| import kagglehub | |
| import os | |
| from pathlib import Path | |
| from collections import defaultdict | |
| import json | |
def analyze_dataset():
    """Summarize the Kaggle waste dataset layout and save it as JSON.

    Resolves the local dataset directory via ``kagglehub.dataset_download``,
    then walks ``<dataset>/images/images/<category>/<variant>/`` and counts
    the ``*.png`` files located directly inside each variant directory.
    A per-category table is printed and the collected counts are written to
    ``dataset_info.json`` in the current working directory.

    Categories and variants are visited in sorted order so the printed table
    and the JSON file are reproducible between runs (``Path.iterdir`` yields
    entries in arbitrary order).

    Returns:
        The dictionary that was written to ``dataset_info.json`` (keys
        ``dataset_path``, ``images_root``, ``categories``, ``total_images``
        and ``num_categories``), or ``None`` when the images folder is
        missing.
    """
    print("🔄 Getting dataset path...")

    # Get dataset path (expected to be already downloaded).
    path = kagglehub.dataset_download(
        "alistairking/recyclable-and-household-waste-classification"
    )
    dataset_path = Path(path)
    print(f"📁 Dataset path: {dataset_path}")

    print("\n📊 Analyzing dataset structure...")

    # Navigate to the images folder; is_dir() also rejects a stray file with
    # that name, which would otherwise make iterdir() raise below.
    images_root = dataset_path / "images" / "images"
    if not images_root.is_dir():
        print(f"❌ Images folder not found at {images_root}")
        return None

    # Every category starts with zeroed counters for the two known variants,
    # so the summary table can always show both columns.
    category_info = defaultdict(
        lambda: {"default": 0, "real_world": 0, "total": 0}
    )

    # Count images per category and variant, in a deterministic order.
    for category_dir in sorted(images_root.iterdir()):
        if not category_dir.is_dir():
            continue
        for variant_dir in sorted(category_dir.iterdir()):
            if not variant_dir.is_dir():
                continue
            # Count lazily instead of materializing the whole file list.
            image_count = sum(1 for _ in variant_dir.glob("*.png"))
            counts = category_info[category_dir.name]
            counts[variant_dir.name] = image_count
            counts["total"] += image_count

    # Print summary.
    print("\n📋 Dataset Summary:")
    print(f"{'Category':<30} {'Default':<10} {'Real-World':<12} {'Total':<8}")
    print("-" * 70)

    total_images = 0
    for category, info in category_info.items():
        default_count = info.get("default", 0)
        real_world_count = info.get("real_world", 0)
        total_count = info["total"]
        total_images += total_count
        print(
            f"{category:<30} {default_count:<10} "
            f"{real_world_count:<12} {total_count:<8}"
        )

    print("-" * 70)
    print(f"{'TOTAL':<30} {'':<10} {'':<12} {total_images:<8}")

    # Save dataset info for finetuning.
    dataset_info = {
        "dataset_path": str(dataset_path),
        "images_root": str(images_root),
        "categories": dict(category_info),
        "total_images": total_images,
        "num_categories": len(category_info),
    }
    with open("dataset_info.json", "w", encoding="utf-8") as f:
        json.dump(dataset_info, f, indent=2)

    print("\n💾 Dataset info saved to dataset_info.json")
    print(
        f"🎯 Found {len(category_info)} categories "
        f"with {total_images} total images"
    )
    return dataset_info
# Script entry point: run the analysis only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    analyze_dataset()