RecycloAI / src /fetch_images.py
seun829's picture
Upload 40 files
b5cb408 verified
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Fetch images into data/external/<Class>/ from the public UCI RealWaste dataset (no API keys required).
Classes: Cardboard, Glass, Metal, Paper, Plastic, Trash
Usage (Windows PowerShell):
pip install pillow requests
python src\fetch_images.py --dest data\external --per-class 500
"""
from __future__ import annotations

import argparse
import io
import pathlib
import random
import shutil
import sys
import zipfile
from typing import Dict, List, Optional, Tuple

import requests
from PIL import Image, ImageOps
# ---------------- Config ----------------
# Target class folders created under the destination root.
CLASSES = ["Cardboard", "Glass", "Metal", "Paper", "Plastic", "Trash"]

# Source folder name in RealWaste -> our class name. Five folders keep their
# name; only "Miscellaneous Trash" is renamed. Iteration order of this dict is
# the order in which source folders are processed.
# Ignored RealWaste folders: "Food Organics", "Textile Trash", "Vegetation".
REALWASTE_TO_OURS: Dict[str, str] = {
    **{name: name for name in ("Cardboard", "Glass", "Metal", "Paper", "Plastic")},
    "Miscellaneous Trash": "Trash",
}

# Direct public ZIP download of the RealWaste dataset hosted by UCI.
UCI_REALWASTE_ZIP = "https://archive.ics.uci.edu/static/public/908/realwaste.zip"
# ---------------- Utils ----------------
def ensure_dir(p: pathlib.Path) -> None:
    """Create directory *p* together with any missing parents.

    Calling it on a directory that already exists is a no-op.
    """
    p.mkdir(exist_ok=True, parents=True)
def dhash(img: Image.Image, size: int = 8) -> str:
    """Return the difference hash of *img* as a zero-padded hex string.

    The image is EXIF-transposed, converted to grayscale and resized to
    ``(size + 1) x size`` pixels. Every pixel is compared with its right-hand
    neighbour, producing ``size * size`` bits (64 bits, i.e. 16 hex digits,
    for the default ``size=8``).
    """
    width = size + 1
    gray = ImageOps.exif_transpose(img).convert("L")
    small = gray.resize((width, size), Image.Resampling.LANCZOS)
    flat = list(small.getdata())
    # Row-major walk; a bit is "1" when a pixel's grayscale value is larger
    # than that of the pixel immediately to its right.
    bit_string = "".join(
        "1" if flat[row_start + col] > flat[row_start + col + 1] else "0"
        for row_start in range(0, size * width, width)
        for col in range(size)
    )
    hex_width = size * size // 4
    return f"{int(bit_string, 2):0{hex_width}x}"
def file_dhash(path: pathlib.Path) -> Optional[str]:
    """Return the difference hash of the image stored at *path*.

    Returns ``None`` when the file cannot be opened or hashed for any reason,
    so callers can simply skip unreadable files.
    """
    digest: Optional[str] = None
    try:
        with Image.open(path) as opened:
            digest = dhash(opened)
    except Exception:
        # Best-effort by design: an unreadable file just has no hash.
        digest = None
    return digest
def collect_existing_hashes(root: pathlib.Path) -> Dict[str, set]:
    """Hash the ``*.jpg`` files already present in each class folder of *root*.

    Returns a dict with one entry per name in ``CLASSES``; the value is the
    set of difference hashes found directly inside ``root/<class>``. Missing
    folders and files that cannot be hashed contribute nothing.
    """
    hashes_by_class: Dict[str, set] = {}
    for class_name in CLASSES:
        class_dir = root / class_name
        if class_dir.exists():
            digests = map(file_dhash, class_dir.glob("*.jpg"))
            found = {digest for digest in digests if digest}
        else:
            found = set()
        hashes_by_class[class_name] = found
    return hashes_by_class
def save_image_to_class(img: Image.Image, class_dir: pathlib.Path, existing_hashes: set, min_side: int) -> Tuple[bool, str]:
    """Normalise *img* and store it in *class_dir* as ``<dhash>.jpg``.

    Returns ``(True, "")`` when the image was written, otherwise
    ``(False, reason)`` where *reason* is ``"too_small"``, ``"dup"``, or
    starts with ``"decode_fail:"`` / ``"save_fail:"`` followed by the error
    text. On success the hash is added to *existing_hashes* (mutated in place).
    """
    try:
        rgb = ImageOps.exif_transpose(img).convert("RGB")
    except Exception as exc:
        return False, f"decode_fail:{exc}"

    shorter_side = min(rgb.size)
    if shorter_side < min_side:
        return False, "too_small"

    digest = dhash(rgb)
    if digest in existing_hashes:
        return False, "dup"

    # The hash doubles as the file name.
    target = class_dir / f"{digest}.jpg"
    try:
        rgb.save(target, format="JPEG", quality=92, optimize=True)
    except Exception as exc:
        return False, f"save_fail:{exc}"
    else:
        existing_hashes.add(digest)
        return True, ""
def human_count(d: Dict[str, int]) -> str:
    """Format a counter dict as ``"key:value, key:value"`` in dict order."""
    pairs = (f"{name}:{count}" for name, count in d.items())
    return ", ".join(pairs)
# --------------- Download / Extract ---------------
def download_realwaste(zip_path: pathlib.Path) -> None:
    """Download the RealWaste archive to *zip_path* unless it is already cached.

    The data is streamed into a sibling ``<name>.part`` file and only renamed
    to *zip_path* once the transfer has finished and the file is recognised as
    a zip archive. An interrupted or truncated download therefore never leaves
    a broken file that later runs would treat as a valid cache.

    Raises:
        requests.HTTPError: the server answered with an error status.
        zipfile.BadZipFile: the downloaded payload is not a zip archive.
    """
    ensure_dir(zip_path.parent)
    if zip_path.exists():
        print("[RealWaste] Using cached zip.")
        return
    print("[RealWaste] Downloading zip (≈650MB)…")
    headers = {"User-Agent": "RecycleAI/1.0 (dataset fetcher)"}
    part_path = zip_path.with_name(zip_path.name + ".part")
    try:
        with requests.get(UCI_REALWASTE_ZIP, stream=True, timeout=300, headers=headers) as r:
            r.raise_for_status()
            with open(part_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=1 << 20):
                    if chunk:  # skip keep-alive chunks
                        f.write(chunk)
        # A truncated transfer or an HTML error page served with status 200
        # fails this check instead of being cached.
        if not zipfile.is_zipfile(part_path):
            raise zipfile.BadZipFile("Downloaded RealWaste file is not a valid zip archive.")
        # Atomic on the same filesystem: zip_path is either absent or complete.
        part_path.replace(zip_path)
    except BaseException:
        # Also covers Ctrl-C: never leave a stale partial file behind.
        try:
            part_path.unlink()
        except OSError:
            pass
        raise
def unzip_any(zip_path: pathlib.Path, out_dir: pathlib.Path) -> None:
    """Extract *zip_path* into *out_dir* unless *out_dir* already has content.

    A non-empty *out_dir* is treated as a finished earlier extraction. To keep
    that assumption valid, a failed or interrupted extraction removes
    *out_dir* again before re-raising, so a later run starts from scratch
    instead of silently using a partial tree.

    Raises:
        zipfile.BadZipFile: the archive is corrupt.
        OSError: the archive cannot be read or the output cannot be written.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    if any(out_dir.iterdir()):
        print("[RealWaste] Using existing unzipped contents.")
        return
    print("[RealWaste] Unzipping…")
    try:
        with zipfile.ZipFile(zip_path, "r") as zf:
            zf.extractall(out_dir)
    except BaseException:
        # BaseException so that Ctrl-C during the long extraction is covered.
        shutil.rmtree(out_dir, ignore_errors=True)
        raise
def locate_realwaste_root(
    unzipped_root: pathlib.Path,
    class_names: Optional[List[str]] = None,
    min_hits: int = 3,
) -> Optional[pathlib.Path]:
    """Find the directory that holds the dataset's class sub-folders.

    The archive typically unpacks to ``realwaste-main/RealWaste/<Class>/*.jpg``,
    but the class folders may also sit directly in *unzipped_root*, so the
    root itself is checked along with every directory below it.

    Args:
        unzipped_root: directory the archive was extracted into.
        class_names: folder names to look for; defaults to the keys of
            ``REALWASTE_TO_OURS``.
        min_hits: how many of those names must exist as sub-directories for a
            directory to qualify.

    Returns:
        The deepest qualifying directory (first one found on ties), or
        ``None`` when no directory qualifies.
    """
    names = tuple(REALWASTE_TO_OURS) if class_names is None else tuple(class_names)

    best: Optional[pathlib.Path] = None
    best_depth = -1
    # rglob("*") never yields the root itself, so it is prepended explicitly.
    for candidate in (unzipped_root, *unzipped_root.rglob("*")):
        if not candidate.is_dir():
            continue
        # Only real directories count; a stray file with a class name does not.
        hits = sum(1 for name in names if (candidate / name).is_dir())
        depth = len(candidate.parts)
        # Strict ">" keeps the first candidate among equally deep ones.
        if hits >= min_hits and depth > best_depth:
            best, best_depth = candidate, depth
    return best
# --------------- Fetch from RealWaste ---------------
def _list_image_files(src_dir: pathlib.Path) -> List[pathlib.Path]:
    """Return all image files below *src_dir*, matching extensions case-insensitively."""
    wanted = {".jpg", ".jpeg", ".png", ".bmp"}
    # Filtering on the lower-cased suffix gives the same file set on
    # case-sensitive and case-insensitive filesystems (e.g. "IMG_1.JPG").
    return [
        path
        for path in src_dir.rglob("*")
        if path.suffix.lower() in wanted and path.is_file()
    ]


def fetch_from_realwaste(dest_root: pathlib.Path, per_class: int, min_side: int) -> Dict[str, int]:
    """Download, unpack and import RealWaste images into ``dest_root/<Class>``.

    The archive is cached under ``data/raw/realwaste`` (relative to the
    current working directory). For every mapped source folder, images are
    visited in random order and saved until *per_class* new images have been
    added to the target class. Images whose shorter side is below *min_side*,
    images that cannot be decoded or saved, and images whose hash is already
    present in the target folder are skipped.

    Args:
        dest_root: output root containing one folder per class.
        per_class: maximum number of images to add per class in this run.
        min_side: minimum shorter-side length in pixels.

    Returns:
        Mapping of class name to the number of images added in this run.

    Raises:
        RuntimeError: the class folders cannot be found after unzipping.
    """
    raw_dir = pathlib.Path("data/raw/realwaste")
    zip_path = raw_dir / "realwaste.zip"
    out_dir = raw_dir / "unzipped"
    download_realwaste(zip_path)
    unzip_any(zip_path, out_dir)
    data_root = locate_realwaste_root(out_dir)
    if data_root is None:
        raise RuntimeError("Could not locate RealWaste class folders after unzip.")
    print(f"[RealWaste] Data root: {data_root}")

    # Prepare output folders and the hashes of what is already there.
    for c in CLASSES:
        ensure_dir(dest_root / c)
    existing = collect_existing_hashes(dest_root)
    added = {c: 0 for c in CLASSES}
    dropped = {"dup": 0, "too_small": 0, "decode_fail": 0, "save_fail": 0}

    for src_name, tgt_name in REALWASTE_TO_OURS.items():
        # Skip the directory scan entirely when the quota is already met.
        if added[tgt_name] >= per_class:
            continue
        src_dir = data_root / src_name
        if not src_dir.exists():
            continue
        files = _list_image_files(src_dir)
        if not files:
            continue
        random.shuffle(files)
        tgt_dir = dest_root / tgt_name
        for p in files:
            if added[tgt_name] >= per_class:
                break
            try:
                with Image.open(p) as im:
                    ok, reason = save_image_to_class(im, tgt_dir, existing[tgt_name], min_side)
            except Exception as e:
                # Image.open itself failed (unreadable or non-image file).
                ok, reason = False, f"decode_fail:{e}"
            if ok:
                added[tgt_name] += 1
            else:
                # Reasons look like "kind" or "kind:detail"; count by kind.
                key = reason.split(":")[0] if reason else "unknown"
                dropped[key] = dropped.get(key, 0) + 1

    print(f"[RealWaste] Added -> {human_count(added)}")
    print(f"[RealWaste] Drops -> {human_count(dropped)}")
    return added
# ---------------- Main ----------------
def main():
    """Parse command-line options, import RealWaste images and print a summary."""
    option_specs = (
        ("--dest", {"default": "data/external", "help": "Output root directory"}),
        ("--per-class", {"type": int, "default": 500, "help": "Max images per class to add"}),
        ("--min-side", {"type": int, "default": 224, "help": "Minimum shorter-side in pixels"}),
    )
    parser = argparse.ArgumentParser()
    for flag, settings in option_specs:
        parser.add_argument(flag, **settings)
    opts = parser.parse_args()

    dest_root = pathlib.Path(opts.dest)
    # Make sure every class folder exists before anything is fetched.
    for class_name in CLASSES:
        ensure_dir(dest_root / class_name)

    total = fetch_from_realwaste(dest_root, opts.per_class, opts.min_side)
    print(f"[TOTAL] Added -> {human_count(total)}")
    print(f"Images are saved under: {dest_root}")
    print("Next: merge with src\\merge_external_into_dataset.py")


# Run the fetcher only when executed as a script, not when imported.
if __name__ == "__main__":
    main()