CoolWasteAI / scripts /fetch_realwaste.py
Celvin
first commit
206d8b5
"""
Materialize the RealWaste dataset from Hugging Face into class folders.
Usage:
python scripts/fetch_realwaste.py --output_dir data/raw/realwaste
"""
import argparse
import re
import shutil
from pathlib import Path
from huggingface_hub import hf_hub_download, list_repo_files
# Hugging Face dataset repository the images are listed in and downloaded from.
DATASET_REPO = "shahzaibvohra/realwaste"

# Maps the class name used as a file-name prefix in the repository to the
# local folder name. Each folder name is the remote name lower-cased with
# spaces replaced by underscores. Classes not listed here are never
# downloaded.
ALLOWED_PREFIXES = {
    remote_name: remote_name.lower().replace(" ", "_")
    for remote_name in (
        "Cardboard",
        "Food Organics",
        "Glass",
        "Metal",
        "Paper",
        "Plastic",
        "Vegetation",
    )
}
def materialize_realwaste(output_dir: Path, prefixes: set[str] | None = None) -> None:
    """Copy RealWaste images from the dataset repo into per-class folders.

    Every file listed in ``DATASET_REPO`` whose base name looks like
    ``<Class>_<number>.jpg`` and whose ``<Class>`` is a key of
    ``ALLOWED_PREFIXES`` is fetched with ``hf_hub_download`` and copied to
    ``output_dir/<mapped class folder>/<base name>``. Files that already
    exist at the destination are skipped, so the function can be re-run.

    Args:
        output_dir: Root directory for the class folders; created if missing.
        prefixes: Optional set of *local* folder names (values of
            ``ALLOWED_PREFIXES``) to restrict the download to. ``None`` or an
            empty set means every allowed class.
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    # Compiled once for the whole listing. IGNORECASE keeps files such as
    # "Glass_3.JPG" from being dropped because of the extension's case.
    name_pattern = re.compile(r"^(.*)_\d+\.jpg$", re.IGNORECASE)

    for file_name in list_repo_files(DATASET_REPO, repo_type="dataset"):
        # Repo paths use "/" separators; match on the base name so images
        # stored in sub-folders are recognised and land directly inside the
        # class folder instead of a nested path that was never created.
        base_name = file_name.rsplit("/", 1)[-1]

        match = name_pattern.match(base_name)
        if match is None:
            continue

        mapped_prefix = ALLOWED_PREFIXES.get(match.group(1))
        if mapped_prefix is None:
            continue
        if prefixes and mapped_prefix not in prefixes:
            continue

        class_dir = output_dir / mapped_prefix
        class_dir.mkdir(parents=True, exist_ok=True)

        destination = class_dir / base_name
        if destination.exists():
            continue

        # Download using the full repo path, then copy the file hf_hub_download
        # returned into the class folder.
        cached_path = hf_hub_download(
            repo_id=DATASET_REPO,
            repo_type="dataset",
            filename=file_name,
        )
        shutil.copy2(cached_path, destination)
def main():
    """Parse command-line options and materialize the requested classes.

    Options:
        --output_dir: Destination root (default ``data/raw/realwaste``).
        --prefixes: Zero or more local class folder names to restrict the
            download to. Names must be values of ``ALLOWED_PREFIXES``;
            anything else is rejected by the parser instead of silently
            downloading nothing. Omitting the option (or passing it with no
            values) selects every allowed class.
    """
    parser = argparse.ArgumentParser(description="Download and organize RealWaste dataset")
    parser.add_argument("--output_dir", default="data/raw/realwaste")
    parser.add_argument(
        "--prefixes",
        nargs="*",
        default=None,
        choices=sorted(ALLOWED_PREFIXES.values()),
        help="Optional subset of local class folders to download, e.g. food_organics plastic paper",
    )
    args = parser.parse_args()

    # An absent or empty list means "no filter".
    selected_prefixes = set(args.prefixes) if args.prefixes else None
    output_dir = Path(args.output_dir)

    materialize_realwaste(output_dir, selected_prefixes)
    print(f"RealWaste materialized at: {output_dir.resolve()}")


if __name__ == "__main__":
    main()