Spaces:
Running
Running
| """ | |
| Materialize the RealWaste dataset from Hugging Face into class folders. | |
| Usage: | |
| python scripts/fetch_realwaste.py --output_dir data/raw/realwaste | |
| """ | |
| import argparse | |
| import re | |
| import shutil | |
| from pathlib import Path | |
| from huggingface_hub import hf_hub_download, list_repo_files | |
# Hugging Face repo id; passed with repo_type="dataset" when listing and
# downloading files.
DATASET_REPO = "shahzaibvohra/realwaste"

# Maps the file-name prefix used in the repo (the text before "_<digits>.jpg")
# to the local class-folder name created under the output directory.
# Files whose prefix is not a key here are skipped.
ALLOWED_PREFIXES = {
    "Cardboard": "cardboard",
    "Food Organics": "food_organics",
    "Glass": "glass",
    "Metal": "metal",
    "Paper": "paper",
    "Plastic": "plastic",
    "Vegetation": "vegetation",
}
def materialize_realwaste(output_dir: Path, prefixes: set[str] | None = None) -> None:
    """Copy RealWaste images from the dataset repo into per-class folders.

    Lists every file in ``DATASET_REPO``, keeps those named
    ``<Prefix>_<digits>.jpg`` (extension matched case-insensitively) whose
    prefix is a key of ``ALLOWED_PREFIXES``, fetches each one with
    ``hf_hub_download`` and copies it to
    ``output_dir/<mapped folder>/<file name>``. A file that already exists at
    its destination is skipped rather than overwritten.

    Args:
        output_dir: Root directory for the class folders; created if missing.
        prefixes: Optional set of local folder names (values of
            ``ALLOWED_PREFIXES``) to restrict the copy to. ``None`` or an
            empty set means every allowed class.
    """
    # One case-insensitive pattern replaces the former pair of checks
    # (lower-cased ".jpg" suffix test + case-sensitive regex), which disagreed
    # on files such as "Glass_12.JPG" and silently dropped them.
    name_pattern = re.compile(r"(.*)_\d+\.jpg", re.IGNORECASE)

    output_dir.mkdir(parents=True, exist_ok=True)

    for file_name in list_repo_files(DATASET_REPO, repo_type="dataset"):
        match = name_pattern.fullmatch(file_name)
        if match is None:
            continue

        # The class prefix itself is still looked up case-sensitively.
        mapped_prefix = ALLOWED_PREFIXES.get(match.group(1))
        if mapped_prefix is None:
            continue
        if prefixes and mapped_prefix not in prefixes:
            continue

        class_dir = output_dir / mapped_prefix
        class_dir.mkdir(parents=True, exist_ok=True)

        destination = class_dir / file_name
        if destination.exists():
            continue

        cached_path = hf_hub_download(
            repo_id=DATASET_REPO,
            repo_type="dataset",
            filename=file_name,
        )
        shutil.copy2(cached_path, destination)
def main():
    """Parse command-line options and materialize the dataset.

    Options:
        --output_dir: Destination root (default ``data/raw/realwaste``).
        --prefixes: Optional list of local class-folder names to restrict the
            download to; omitted or empty means all classes.

    Folder names that are not values of ``ALLOWED_PREFIXES`` can never match a
    file, so they are reported on stderr instead of being ignored silently.
    """
    import sys  # local import: only needed for the warning below

    parser = argparse.ArgumentParser(description="Download and organize RealWaste dataset")
    parser.add_argument("--output_dir", default="data/raw/realwaste")
    parser.add_argument(
        "--prefixes",
        nargs="*",
        default=None,
        # The example lists real folder names; the previous "organic" is not
        # a folder this script produces.
        help="Optional subset of local class folders to download, e.g. food_organics plastic paper",
    )
    args = parser.parse_args()

    selected_prefixes = set(args.prefixes) if args.prefixes else None
    if selected_prefixes:
        unknown = selected_prefixes - set(ALLOWED_PREFIXES.values())
        if unknown:
            # Warn rather than abort so existing invocations keep working.
            print(
                f"Warning: ignoring unknown class folders: {', '.join(sorted(unknown))} "
                f"(valid: {', '.join(sorted(ALLOWED_PREFIXES.values()))})",
                file=sys.stderr,
            )

    output_dir = Path(args.output_dir)
    materialize_realwaste(output_dir, selected_prefixes)
    print(f"RealWaste materialized at: {output_dir.resolve()}")


if __name__ == "__main__":
    main()