File size: 2,209 Bytes
206d8b5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
"""
Materialize the RealWaste dataset from Hugging Face into class folders.

Usage:
    python scripts/fetch_realwaste.py --output_dir data/raw/realwaste
"""

import argparse
import re
import shutil
from pathlib import Path

from huggingface_hub import hf_hub_download, list_repo_files

# Hugging Face dataset repository the images are listed in and downloaded from.
DATASET_REPO = "shahzaibvohra/realwaste"

# Maps the remote file-name prefix (the text before the trailing
# ``_<digits>.jpg``) to the local class folder created under the output
# directory. Remote files whose prefix is not a key here are skipped.
ALLOWED_PREFIXES = {
    "Cardboard": "cardboard",
    "Food Organics": "food_organics",
    "Glass": "glass",
    "Metal": "metal",
    "Paper": "paper",
    "Plastic": "plastic",
    "Vegetation": "vegetation",
}


def materialize_realwaste(output_dir: Path, prefixes: set[str] | None = None) -> None:
    """Copy RealWaste images from the Hugging Face hub into class folders.

    Every file in ``DATASET_REPO`` whose base name looks like
    ``<Prefix>_<digits>.jpg`` and whose ``<Prefix>`` is a key of
    ``ALLOWED_PREFIXES`` is downloaded (through the hub cache) and copied to
    ``output_dir/<mapped folder>/<base name>``. Files that already exist at
    the destination are left untouched, so the function can be re-run to
    resume an interrupted download.

    Args:
        output_dir: Root directory that receives one sub-folder per class.
            Created if missing.
        prefixes: Optional set of *local* folder names (values of
            ``ALLOWED_PREFIXES``) to restrict the download to. ``None`` or an
            empty set selects every known class.
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    # Compiled once, outside the loop. Case-insensitive so that ``.JPG`` files
    # are accepted rather than silently dropped.
    image_pattern = re.compile(r"^(.*)_\d+\.jpg$", re.IGNORECASE)

    for repo_path in list_repo_files(DATASET_REPO, repo_type="dataset"):
        # Repo listings are paths relative to the repo root and may contain
        # directories; classify and name the local copy by base name only so
        # nested files neither miss the prefix lookup nor point at a
        # destination directory that was never created.
        base_name = Path(repo_path).name

        match = image_pattern.match(base_name)
        if match is None:
            continue

        mapped_prefix = ALLOWED_PREFIXES.get(match.group(1))
        if mapped_prefix is None:
            continue
        if prefixes and mapped_prefix not in prefixes:
            continue

        class_dir = output_dir / mapped_prefix
        class_dir.mkdir(parents=True, exist_ok=True)

        destination = class_dir / base_name
        if destination.exists():
            # Already materialized (or a same-named file from another repo
            # directory was copied first); skip the download.
            continue

        # The hub must be addressed with the full repo-relative path.
        cached_path = hf_hub_download(
            repo_id=DATASET_REPO,
            repo_type="dataset",
            filename=repo_path,
        )
        shutil.copy2(cached_path, destination)


def main() -> None:
    """Parse command-line arguments and materialize the dataset.

    ``--output_dir`` selects the destination root and ``--prefixes``
    optionally restricts the download to a subset of local class folders.
    Folder names that do not correspond to any known class are reported on
    stderr, because they would otherwise silently select nothing.
    """
    import sys  # local import: only needed for the warning stream

    parser = argparse.ArgumentParser(description="Download and organize RealWaste dataset")
    parser.add_argument("--output_dir", default="data/raw/realwaste")
    parser.add_argument(
        "--prefixes",
        nargs="*",
        default=None,
        help="Optional subset of local class folders to download, e.g. food_organics plastic paper",
    )
    args = parser.parse_args()

    # An absent or empty --prefixes means "download every class".
    selected_prefixes = set(args.prefixes) if args.prefixes else None

    if selected_prefixes:
        unknown = selected_prefixes - set(ALLOWED_PREFIXES.values())
        if unknown:
            print(
                f"Warning: ignoring unknown class folders: {', '.join(sorted(unknown))} "
                f"(valid: {', '.join(sorted(ALLOWED_PREFIXES.values()))})",
                file=sys.stderr,
            )

    materialize_realwaste(Path(args.output_dir), selected_prefixes)
    print(f"RealWaste materialized at: {Path(args.output_dir).resolve()}")


# Run the command-line entry point only when executed as a script, so that
# importing this module has no side effects.
if __name__ == "__main__":
    main()