"""
Materialize the RealWaste dataset from Hugging Face into class folders.

Usage:
    python scripts/fetch_realwaste.py --output_dir data/raw/realwaste
"""

import argparse
import re
import shutil
from pathlib import Path

from huggingface_hub import hf_hub_download, list_repo_files

DATASET_REPO = "shahzaibvohra/realwaste"

# Maps the class prefix used in repo file names ("<prefix>_<number>.jpg") to
# the local folder the image is stored in. Repo files whose prefix is not
# listed here are skipped.
ALLOWED_PREFIXES = {
    "Cardboard": "cardboard",
    "Food Organics": "food_organics",
    "Glass": "glass",
    "Metal": "metal",
    "Paper": "paper",
    "Plastic": "plastic",
    "Vegetation": "vegetation",
}

# "<prefix>_<number>.jpg". The extension is matched case-insensitively so that
# a ".JPG" file is classified the same way as a ".jpg" one instead of being
# silently dropped. Compiled once because it is applied to every repo file.
_IMAGE_NAME_PATTERN = re.compile(r"(.*)_\d+\.jpg", re.IGNORECASE)


def materialize_realwaste(output_dir: Path, prefixes: set[str] | None = None) -> None:
    """Copy RealWaste images from the Hugging Face dataset into class folders.

    Every file in ``DATASET_REPO`` named ``<prefix>_<number>.jpg`` whose prefix
    is a key of ``ALLOWED_PREFIXES`` is downloaded and copied to
    ``output_dir/<mapped folder>/<file name>``. Files already present at their
    destination are left untouched, so the function can be re-run to resume.

    Args:
        output_dir: Root directory for the class folders; created if missing.
        prefixes: Optional set of local folder names (values of
            ``ALLOWED_PREFIXES``) to restrict the download to. ``None`` or an
            empty set selects every allowed class.
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    for file_name in list_repo_files(DATASET_REPO, repo_type="dataset"):
        match = _IMAGE_NAME_PATTERN.fullmatch(file_name)
        if match is None:
            continue

        mapped_prefix = ALLOWED_PREFIXES.get(match.group(1))
        if mapped_prefix is None:
            continue
        if prefixes and mapped_prefix not in prefixes:
            continue

        class_dir = output_dir / mapped_prefix
        class_dir.mkdir(parents=True, exist_ok=True)

        destination = class_dir / file_name
        if destination.exists():
            continue

        cached_path = hf_hub_download(
            repo_id=DATASET_REPO,
            repo_type="dataset",
            filename=file_name,
        )

        # Copy under a temporary name and rename into place. If the copy is
        # interrupted, the final path never holds a truncated image that the
        # exists() check above would treat as complete on the next run.
        partial = destination.with_name(destination.name + ".part")
        shutil.copy2(cached_path, partial)
        partial.replace(destination)


def main() -> None:
    """Parse command-line arguments and materialize the dataset."""
    parser = argparse.ArgumentParser(description="Download and organize RealWaste dataset")
    parser.add_argument("--output_dir", default="data/raw/realwaste")
    parser.add_argument(
        "--prefixes",
        nargs="*",
        default=None,
        help="Optional subset of local class folders to download, e.g. food_organics plastic paper",
    )
    args = parser.parse_args()

    selected_prefixes = set(args.prefixes) if args.prefixes else None
    if selected_prefixes:
        # An unknown folder name would otherwise match nothing and the script
        # would still report success with an empty output directory.
        unknown = selected_prefixes.difference(ALLOWED_PREFIXES.values())
        if unknown:
            parser.error(
                f"unknown prefixes: {', '.join(sorted(unknown))} "
                f"(choose from: {', '.join(sorted(ALLOWED_PREFIXES.values()))})"
            )

    output_dir = Path(args.output_dir)
    materialize_realwaste(output_dir, selected_prefixes)
    print(f"RealWaste materialized at: {output_dir.resolve()}")


if __name__ == "__main__":
    main()