|
|
|
|
|
|
|
|
"""
|
|
|
Fetch images into data/external/<Class>/ from the UCI RealWaste dataset (no keys).
|
|
|
Classes: Cardboard, Glass, Metal, Paper, Plastic, Trash
|
|
|
|
|
|
Usage (Windows PowerShell):
|
|
|
pip install pillow requests
|
|
|
python src\\fetch_images.py --dest data\\external --per-class 500
|
|
|
"""
|
|
|
|
|
|
from __future__ import annotations
|
|
|
import argparse
|
|
|
import io
|
|
|
import zipfile
|
|
|
import random
|
|
|
import sys
|
|
|
from typing import Dict, List, Tuple, Optional
|
|
|
import pathlib
|
|
|
|
|
|
import requests
|
|
|
from PIL import Image, ImageOps
|
|
|
|
|
|
|
|
|
|
|
|
# Names of the per-class output folders created under the destination root.
CLASSES = ["Cardboard", "Glass", "Metal", "Paper", "Plastic", "Trash"]

# RealWaste source folder name -> our class name. Five folders keep their
# name; "Miscellaneous Trash" is imported as "Trash".
REALWASTE_TO_OURS: Dict[str, str] = {
    **{name: name for name in ("Cardboard", "Glass", "Metal", "Paper", "Plastic")},
    "Miscellaneous Trash": "Trash",
}

# Download URL of the RealWaste zip archive.
UCI_REALWASTE_ZIP = "https://archive.ics.uci.edu/static/public/908/realwaste.zip"
|
|
|
|
|
|
|
|
|
|
|
|
def ensure_dir(p: pathlib.Path) -> None:
    """Create directory *p* and any missing parents; do nothing if it already exists."""
    p.mkdir(exist_ok=True, parents=True)
|
|
|
|
|
|
def dhash(img: Image.Image, size: int = 8) -> str:
    """Return the difference hash of *img* as a hex string.

    The image is EXIF-transposed, converted to grayscale and resized to
    ``(size + 1) x size`` pixels. Each of the ``size * size`` bits is 1 when a
    pixel is brighter than its right-hand neighbour. The result is zero-padded
    to ``size * size // 4`` hex digits (16 for the default 8x8 hash).
    """
    width = size + 1
    gray = ImageOps.exif_transpose(img).convert("L")
    small = gray.resize((width, size), Image.Resampling.LANCZOS)
    pixels = list(small.getdata())

    # Walk the flat pixel list one row (``width`` values) at a time.
    rows = (pixels[start:start + width] for start in range(0, width * size, width))
    bits = "".join(
        "1" if left > right else "0"
        for row in rows
        for left, right in zip(row, row[1:])
    )
    return format(int(bits, 2), f"0{size * size // 4}x")
|
|
|
|
|
|
def file_dhash(path: pathlib.Path) -> Optional[str]:
    """Return the dhash of the image file at *path*, or None if it cannot be hashed.

    Any error while opening or hashing the file is swallowed on purpose, so
    unreadable files are simply skipped by callers.
    """
    try:
        with Image.open(path) as image:
            digest = dhash(image)
    except Exception:
        return None
    return digest
|
|
|
|
|
|
def collect_existing_hashes(root: pathlib.Path) -> Dict[str, set]:
    """Hash the ``*.jpg`` files already present in each class folder under *root*.

    Returns a dict with one entry per name in ``CLASSES``; the value is the
    set of dhash strings found in ``root/<class>``. The set is empty when the
    folder is missing, and files that cannot be hashed are skipped.
    """
    hashes_by_class: Dict[str, set] = {}
    for class_name in CLASSES:
        found = set()
        class_dir = root / class_name
        if class_dir.exists():
            for jpg_path in class_dir.glob("*.jpg"):
                digest = file_dhash(jpg_path)
                if digest:
                    found.add(digest)
        hashes_by_class[class_name] = found
    return hashes_by_class
|
|
|
|
|
|
def save_image_to_class(img: Image.Image, class_dir: pathlib.Path, existing_hashes: set, min_side: int) -> Tuple[bool, str]:
    """Normalise *img*, de-duplicate it by dhash and save it as ``<hash>.jpg``.

    Args:
        img: Opened image to import.
        class_dir: Folder the JPEG is written into.
        existing_hashes: Hashes already present for this class; the new hash
            is added to this set when the image is kept.
        min_side: Images whose shorter side is below this are rejected.

    Returns:
        ``(True, "")`` when the image was saved, otherwise ``(False, reason)``
        where reason is ``"too_small"``, ``"dup"``, ``"decode_fail:<error>"``
        or ``"save_fail:<error>"``.
    """
    try:
        rgb = ImageOps.exif_transpose(img).convert("RGB")
    except Exception as e:
        return False, f"decode_fail:{e}"

    if min(rgb.size) < min_side:
        return False, "too_small"

    digest = dhash(rgb)
    if digest in existing_hashes:
        return False, "dup"

    out_path = class_dir / f"{digest}.jpg"
    try:
        rgb.save(out_path, format="JPEG", quality=92, optimize=True)
    except Exception as e:
        return False, f"save_fail:{e}"
    else:
        # Record the hash only once the file has actually been written.
        existing_hashes.add(digest)
        return True, ""
|
|
|
|
|
|
def human_count(d: Dict[str, int]) -> str:
    """Format a name -> count mapping as ``"name:count, name:count"`` in dict order."""
    parts = [f"{name}:{count}" for name, count in d.items()]
    return ", ".join(parts)
|
|
|
|
|
|
|
|
|
|
|
|
def download_realwaste(zip_path: pathlib.Path) -> None:
    """Download the RealWaste archive to *zip_path* unless it is already there.

    The body is streamed into ``<zip_path>.part`` and only renamed to
    *zip_path* once the download has finished and the file is a readable zip.
    This way an interrupted or failed download is never mistaken for a valid
    cached archive on the next run.

    Raises:
        requests.RequestException: On network or HTTP errors.
        RuntimeError: If the downloaded file is not a zip archive.
    """
    ensure_dir(zip_path.parent)
    if zip_path.exists():
        print("[RealWaste] Using cached zip.")
        return

    print("[RealWaste] Downloading zip (≈650MB)…")
    headers = {"User-Agent": "RecycleAI/1.0 (dataset fetcher)"}
    part_path = zip_path.with_name(zip_path.name + ".part")
    try:
        with requests.get(UCI_REALWASTE_ZIP, stream=True, timeout=300, headers=headers) as r:
            r.raise_for_status()
            with open(part_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=1 << 20):
                    if chunk:
                        f.write(chunk)
        # Guard against caching something that is not the archive
        # (e.g. an HTML error page served with status 200).
        if not zipfile.is_zipfile(part_path):
            raise RuntimeError("Downloaded RealWaste file is not a valid zip archive.")
        part_path.replace(zip_path)
    except BaseException:
        # Also covers Ctrl+C: drop the partial file, then re-raise unchanged.
        try:
            part_path.unlink()
        except OSError:
            pass  # best-effort cleanup; the original error matters more
        raise
|
|
|
|
|
|
def unzip_any(zip_path: pathlib.Path, out_dir: pathlib.Path) -> None:
    """Extract *zip_path* into *out_dir* unless *out_dir* already has contents.

    If extraction fails or is interrupted, the partially extracted tree is
    removed before the error is re-raised. Otherwise the "already has
    contents" check would treat the incomplete tree as a finished extraction
    on the next run.

    Raises:
        zipfile.BadZipFile: If *zip_path* is not a valid zip archive.
        OSError: On filesystem errors while extracting.
    """
    import shutil  # local import: only needed for the failure-cleanup path

    ensure_dir(out_dir)
    if any(out_dir.iterdir()):
        print("[RealWaste] Using existing unzipped contents.")
        return

    print("[RealWaste] Unzipping…")
    try:
        with zipfile.ZipFile(zip_path, "r") as zf:
            zf.extractall(out_dir)
    except BaseException:
        # out_dir was empty before we started, so everything in it is ours.
        shutil.rmtree(out_dir, ignore_errors=True)
        raise
|
|
|
|
|
|
def locate_realwaste_root(unzipped_root: pathlib.Path) -> Optional[pathlib.Path]:
    """Find the directory that holds the RealWaste class subfolders.

    The target looks like RealWaste/Cardboard, RealWaste/Glass, …
    The zip typically has: realwaste-main/RealWaste/<Class>/*.jpg

    A directory qualifies when at least three of the source folder names in
    ``REALWASTE_TO_OURS`` exist directly below it as directories.
    *unzipped_root* itself is checked as well, so an archive that unpacks the
    class folders at its top level is still found.

    Returns:
        The deepest qualifying directory, or None when nothing qualifies.
    """
    candidates = []
    # rglob() never yields its own root, so the root is listed explicitly.
    for p in [unzipped_root, *unzipped_root.rglob("*")]:
        if not p.is_dir():
            continue
        hits = sum(1 for src_name in REALWASTE_TO_OURS if (p / src_name).is_dir())
        if hits >= 3:
            candidates.append(p)

    if not candidates:
        return None

    # Prefer the most deeply nested match.
    return max(candidates, key=lambda x: len(x.parts))
|
|
|
|
|
|
|
|
|
|
|
|
def fetch_from_realwaste(dest_root: pathlib.Path, per_class: int, min_side: int) -> Dict[str, int]:
    """Download RealWaste if needed and import up to *per_class* new images per class.

    The archive is cached under ``data/raw/realwaste`` (relative to the
    current working directory). Images are taken in random order from each
    source folder in ``REALWASTE_TO_OURS`` and saved into
    ``dest_root/<class>``. Duplicates, images smaller than *min_side* and
    unreadable files are dropped and counted.

    Returns:
        Number of images added in this run, keyed by class name.

    Raises:
        RuntimeError: If the class folders cannot be found after unzipping.
    """
    cache_dir = pathlib.Path("data/raw/realwaste")
    archive = cache_dir / "realwaste.zip"
    extracted = cache_dir / "unzipped"

    download_realwaste(archive)
    unzip_any(archive, extracted)

    data_root = locate_realwaste_root(extracted)
    if data_root is None:
        raise RuntimeError("Could not locate RealWaste class folders after unzip.")
    print(f"[RealWaste] Data root: {data_root}")

    for class_name in CLASSES:
        ensure_dir(dest_root / class_name)
    seen = collect_existing_hashes(dest_root)

    added = dict.fromkeys(CLASSES, 0)
    dropped = {"dup": 0, "too_small": 0, "decode_fail": 0, "save_fail": 0}
    patterns = ("*.jpg", "*.jpeg", "*.png", "*.bmp")

    for src_name, tgt_name in REALWASTE_TO_OURS.items():
        src_dir = data_root / src_name
        if added[tgt_name] >= per_class or not src_dir.exists():
            continue

        files = [found for pattern in patterns for found in src_dir.rglob(pattern)]
        if not files:
            continue
        random.shuffle(files)

        tgt_dir = dest_root / tgt_name
        for image_path in files:
            if added[tgt_name] >= per_class:
                break
            try:
                with Image.open(image_path) as im:
                    ok, reason = save_image_to_class(im, tgt_dir, seen[tgt_name], min_side)
            except Exception as e:
                ok, reason = False, f"decode_fail:{e}"

            if ok:
                added[tgt_name] += 1
                continue
            # The reason is "<kind>" or "<kind>:<detail>"; count by kind only.
            key = reason.partition(":")[0] if reason else "unknown"
            dropped[key] = dropped.get(key, 0) + 1

    print(f"[RealWaste] Added -> {human_count(added)}")
    print(f"[RealWaste] Drops -> {human_count(dropped)}")
    return added
|
|
|
|
|
|
|
|
|
|
|
|
def main():
    """Parse the command-line options, run the RealWaste import and print a summary."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--dest", default="data/external", help="Output root directory")
    parser.add_argument("--per-class", type=int, default=500, help="Max images per class to add")
    parser.add_argument("--min-side", type=int, default=224, help="Minimum shorter-side in pixels")
    options = parser.parse_args()

    dest_root = pathlib.Path(options.dest)
    for class_name in CLASSES:
        ensure_dir(dest_root / class_name)

    total = fetch_from_realwaste(dest_root, per_class=options.per_class, min_side=options.min_side)

    print(
        f"[TOTAL] Added -> {human_count(total)}",
        f"Images are saved under: {dest_root}",
        "Next: merge with src\\merge_external_into_dataset.py",
        sep="\n",
    )


if __name__ == "__main__":
    main()
|
|
|
|