opeCLIP-waste-wizard / analyze_dataset.py
ysfad's picture
Implement proper ML model hosting with Hugging Face Hub integration
e1a6bed
#!/usr/bin/env python3
"""Analyze the Kaggle waste dataset structure for finetuning."""
import kagglehub
import os
from pathlib import Path
from collections import defaultdict
import json
def _count_images(images_root):
    """Count PNG files per category and variant under *images_root*.

    The expected layout is ``images_root/<category>/<variant>/*.png``.
    Non-directory entries at either level are ignored.

    Args:
        images_root: ``Path`` of the directory holding one folder per category.

    Returns:
        A plain dict mapping category name to a dict of per-variant counts
        plus a ``"total"`` key. The ``"default"`` and ``"real_world"`` keys
        are always present (0 when that variant folder is absent).
    """
    category_info = defaultdict(lambda: {"default": 0, "real_world": 0, "total": 0})

    # iterdir() yields entries in arbitrary order; sorting keeps the printed
    # table and the saved JSON stable from one run to the next.
    for category_dir in sorted(images_root.iterdir()):
        if not category_dir.is_dir():
            continue
        for variant_dir in sorted(category_dir.iterdir()):
            if not variant_dir.is_dir():
                continue
            # Count lazily instead of materializing the list of paths.
            image_count = sum(1 for _ in variant_dir.glob("*.png"))
            counts = category_info[category_dir.name]
            counts[variant_dir.name] = image_count
            counts["total"] += image_count

    return dict(category_info)


def _print_summary(category_info, total_images):
    """Print the per-category count table followed by the grand total row."""
    print("\n📋 Dataset Summary:")
    print(f"{'Category':<30} {'Default':<10} {'Real-World':<12} {'Total':<8}")
    print("-" * 70)
    for category, info in category_info.items():
        default_count = info.get("default", 0)
        real_world_count = info.get("real_world", 0)
        total_count = info["total"]
        print(f"{category:<30} {default_count:<10} {real_world_count:<12} {total_count:<8}")
    print("-" * 70)
    print(f"{'TOTAL':<30} {'':<10} {'':<12} {total_images:<8}")


def analyze_dataset(dataset_path=None, output_path="dataset_info.json"):
    """Summarize the waste dataset's image counts and save them as JSON.

    Counts the ``*.png`` files in every ``images/images/<category>/<variant>``
    folder of the dataset, prints a summary table, and writes the collected
    information to *output_path*.

    Args:
        dataset_path: Root directory of a local dataset copy (str or Path).
            When ``None`` (the default), the path is obtained from
            ``kagglehub.dataset_download``.
        output_path: Where to write the JSON summary. Defaults to
            ``dataset_info.json`` in the current working directory.

    Returns:
        A dict with the keys ``dataset_path``, ``images_root``,
        ``categories``, ``total_images`` and ``num_categories``, or ``None``
        when the ``images/images`` folder does not exist (nothing is written
        in that case).
    """
    print("🔄 Getting dataset path...")

    if dataset_path is None:
        # Ask kagglehub for the dataset's local path.
        dataset_path = kagglehub.dataset_download(
            "alistairking/recyclable-and-household-waste-classification"
        )
    dataset_path = Path(dataset_path)
    print(f"📁 Dataset path: {dataset_path}")

    print("\n📊 Analyzing dataset structure...")

    # The images live in a doubly nested "images/images" folder.
    images_root = dataset_path / "images" / "images"
    if not images_root.exists():
        print(f"❌ Images folder not found at {images_root}")
        return None

    category_info = _count_images(images_root)
    total_images = sum(info["total"] for info in category_info.values())

    _print_summary(category_info, total_images)

    # Save dataset info for finetuning
    dataset_info = {
        "dataset_path": str(dataset_path),
        "images_root": str(images_root),
        "categories": category_info,
        "total_images": total_images,
        "num_categories": len(category_info),
    }

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(dataset_info, f, indent=2)

    print(f"\n💾 Dataset info saved to {output_path}")
    print(f"🎯 Found {len(category_info)} categories with {total_images} total images")

    return dataset_info
# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    analyze_dataset()