# Spaces: seun829 / RecycloAI (Sleeping)
# File size: 7,866 Bytes
# b5cb408

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
r"""
Fetch images into data/external/<Class>/ from the UCI RealWaste dataset (no API keys required).

Classes: Cardboard, Glass, Metal, Paper, Plastic, Trash

Usage (Windows PowerShell):

  pip install pillow requests
  python src\fetch_images.py --dest data\external --per-class 500
"""

from __future__ import annotations
import argparse
import io
import zipfile
import random
import sys
from typing import Dict, List, Tuple, Optional
import pathlib

import requests
from PIL import Image, ImageOps

# ---------------- Config ----------------

# Target class labels; each one becomes a sub-directory of the destination root.
CLASSES = ["Cardboard", "Glass", "Metal", "Paper", "Plastic", "Trash"]

# RealWaste has these class folders; we map them to our 6.
# Keys are folder names looked up inside the extracted dataset; values are
# entries of CLASSES.
REALWASTE_TO_OURS: Dict[str, str] = {
    "Cardboard": "Cardboard",
    "Glass": "Glass",
    "Metal": "Metal",
    "Paper": "Paper",
    "Plastic": "Plastic",
    "Miscellaneous Trash": "Trash",
    # Not mapped (ignored): "Food Organics", "Textile Trash", "Vegetation"
}

# Direct public ZIP from UCI (RealWaste dataset); fetched by download_realwaste().
UCI_REALWASTE_ZIP = "https://archive.ics.uci.edu/static/public/908/realwaste.zip"

# ---------------- Utils ----------------

def ensure_dir(p: pathlib.Path) -> None:
    """Create directory ``p`` together with any missing parents.

    An already existing directory is left untouched.
    """
    p.mkdir(exist_ok=True, parents=True)

def dhash(img: Image.Image, size: int = 8) -> str:
    """Return a difference hash of ``img`` as a lowercase hex string.

    The image is EXIF-transposed, converted to grayscale and resized to
    ``(size + 1) x size`` pixels. Each row then contributes ``size`` bits:
    a bit is 1 when a pixel is brighter than its right-hand neighbour.
    The result is zero-padded to ``size * size // 4`` hex digits
    (16 digits for the default 8x8 hash).
    """
    width = size + 1
    gray = ImageOps.exif_transpose(img).convert("L")
    small = gray.resize((width, size), Image.Resampling.LANCZOS)
    flat = list(small.getdata())

    bits = []
    # Walk the flat pixel list one row (``width`` pixels) at a time.
    for start in range(0, width * size, width):
        row = flat[start:start + width]
        bits.extend("1" if left > right else "0" for left, right in zip(row, row[1:]))

    hex_digits = size * size // 4
    return f"{int(''.join(bits), 2):0{hex_digits}x}"

def file_dhash(path: pathlib.Path) -> Optional[str]:
    """Return the dhash of the image stored at ``path``.

    Best effort: any failure while opening or hashing the file yields
    ``None`` instead of raising.
    """
    try:
        with Image.open(path) as image:
            digest = dhash(image)
    except Exception:
        return None
    else:
        return digest

def collect_existing_hashes(root: pathlib.Path) -> Dict[str, set]:
    """Hash the ``*.jpg`` files already present under ``root/<Class>/``.

    Returns a dict with one entry per name in ``CLASSES``; each value is the
    set of dhash strings of the readable JPEGs directly inside that class
    directory. Missing directories and unreadable files contribute nothing.
    """
    hashes_by_class: Dict[str, set] = {}
    for label in CLASSES:
        found = set()
        class_dir = root / label
        if class_dir.exists():
            digests = (file_dhash(jpg) for jpg in class_dir.glob("*.jpg"))
            # file_dhash() returns None for files it cannot read; skip those.
            found.update(digest for digest in digests if digest)
        hashes_by_class[label] = found
    return hashes_by_class

def save_image_to_class(img: Image.Image, class_dir: pathlib.Path, existing_hashes: set, min_side: int) -> Tuple[bool, str]:
    """Normalise ``img`` and store it as ``<dhash>.jpg`` inside ``class_dir``.

    The image is EXIF-transposed and converted to RGB, rejected if its shorter
    side is below ``min_side``, and rejected as a duplicate if its dhash is
    already in ``existing_hashes`` or a file named after that hash already
    exists in ``class_dir``. On success the hash is added to
    ``existing_hashes`` (the set is mutated in place).

    Returns:
        ``(kept, reason)`` where ``reason`` is ``""`` when the image was
        saved, otherwise one of ``"decode_fail:<error>"``, ``"too_small"``,
        ``"dup"`` or ``"save_fail:<error>"``.
    """
    try:
        rgb = ImageOps.exif_transpose(img).convert("RGB")
    except Exception as e:
        return False, f"decode_fail:{e}"
    if min(rgb.size) < min_side:
        return False, "too_small"
    h = dhash(rgb)
    if h in existing_hashes:
        return False, "dup"
    target = class_dir / f"{h}.jpg"
    if target.exists():
        # A file named after this hash is already on disk although the hash
        # was not in the set (e.g. the hash recomputed from the saved JPEG
        # differed, or the file could not be read). Overwriting it would
        # count the same image as newly added, so treat it as a duplicate.
        existing_hashes.add(h)
        return False, "dup"
    try:
        rgb.save(target, format="JPEG", quality=92, optimize=True)
    except Exception as e:
        # Do not leave a truncated file behind: it would later be mistaken
        # for an already-saved image by the existence check above.
        try:
            target.unlink()
        except OSError:
            pass  # nothing was written, or it cannot be removed; report the save error
        return False, f"save_fail:{e}"
    existing_hashes.add(h)
    return True, ""

def human_count(d: Dict[str, int]) -> str:
    """Render ``d`` as ``key:value`` pairs joined by ", " in iteration order."""
    parts = []
    for label, count in d.items():
        parts.append(f"{label}:{count}")
    return ", ".join(parts)

# --------------- Download / Extract ---------------

def download_realwaste(zip_path: pathlib.Path) -> None:
    """Download the RealWaste archive to ``zip_path`` unless it already exists.

    The data is streamed into a sibling ``<name>.part`` file and only renamed
    to ``zip_path`` once the transfer finished and the file is recognised as
    a zip archive. An interrupted or truncated download therefore never
    leaves a file at ``zip_path`` that a later run would accept as cached.

    Raises:
        requests.RequestException: on connection errors or a non-2xx response.
        zipfile.BadZipFile: if the downloaded file is not a valid zip archive.
    """
    ensure_dir(zip_path.parent)
    if zip_path.exists():
        print("[RealWaste] Using cached zip.")
        return
    print("[RealWaste] Downloading zip (≈650MB)…")
    headers = {"User-Agent": "RecycleAI/1.0 (dataset fetcher)"}
    part_path = zip_path.with_name(zip_path.name + ".part")
    try:
        with requests.get(UCI_REALWASTE_ZIP, stream=True, timeout=300, headers=headers) as r:
            r.raise_for_status()
            with open(part_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=1 << 20):
                    if chunk:
                        f.write(chunk)
        # Cheap sanity check (looks for the zip end-of-central-directory
        # record) so a truncated body is rejected here, not during unzip.
        if not zipfile.is_zipfile(part_path):
            raise zipfile.BadZipFile(
                f"Downloaded file is not a valid zip archive: {UCI_REALWASTE_ZIP}"
            )
        part_path.replace(zip_path)
    except BaseException:
        # BaseException so that Ctrl-C also cleans up the partial file.
        try:
            part_path.unlink()
        except OSError:
            pass  # partial file was never created or is already gone
        raise

def unzip_any(zip_path: pathlib.Path, out_dir: pathlib.Path) -> None:
    """Extract ``zip_path`` into ``out_dir`` unless ``out_dir`` already has content.

    If extraction fails or is interrupted, ``out_dir`` is removed again so
    that a later run does not mistake a partial extraction for a complete one.

    Raises:
        zipfile.BadZipFile: if ``zip_path`` is not a valid zip archive.
        OSError: if the archive cannot be read or the files cannot be written.
    """
    import shutil  # local import: only needed for cleanup after a failed extraction

    ensure_dir(out_dir)
    if any(out_dir.iterdir()):
        print("[RealWaste] Using existing unzipped contents.")
        return
    print("[RealWaste] Unzipping…")
    try:
        with zipfile.ZipFile(zip_path, "r") as zf:
            zf.extractall(out_dir)
    except BaseException:
        # out_dir was empty before we started, so everything inside it is
        # partial output of this call and is safe to delete (Ctrl-C included).
        shutil.rmtree(out_dir, ignore_errors=True)
        raise

def locate_realwaste_root(unzipped_root: pathlib.Path) -> Optional[pathlib.Path]:
    """Find the directory that holds the RealWaste class sub-folders.

    Looks for a directory containing class folders like
    ``RealWaste/Cardboard``, ``RealWaste/Glass``, …
    The zip typically has: ``realwaste-main/RealWaste/<Class>/*.jpg``

    ``unzipped_root`` itself and every directory below it are examined. A
    directory qualifies when it contains at least 3 of the source folder
    names listed in ``REALWASTE_TO_OURS`` as sub-directories.

    Returns:
        The deepest qualifying directory, or ``None`` if there is none.
    """

    def class_dir_hits(folder: pathlib.Path) -> int:
        # Number of expected class folders present directly inside ``folder``.
        return sum(1 for src_name in REALWASTE_TO_OURS if (folder / src_name).is_dir())

    candidates = []
    # rglob("*") yields only descendants, so test the root explicitly: the
    # archive may have been extracted with the class folders at top level.
    if class_dir_hits(unzipped_root) >= 3:
        candidates.append(unzipped_root)
    for p in unzipped_root.rglob("*"):
        # Heuristic: 3 or more expected class dirs is enough evidence.
        if p.is_dir() and class_dir_hits(p) >= 3:
            candidates.append(p)
    if not candidates:
        return None
    # Prefer the deepest path (most specific)
    return max(candidates, key=lambda x: len(x.parts))

# --------------- Fetch from RealWaste ---------------

def fetch_from_realwaste(dest_root: pathlib.Path, per_class: int, min_side: int) -> Dict[str, int]:
    """Download RealWaste and copy up to ``per_class`` new images into each class folder.

    The archive is cached and extracted under ``data/raw/realwaste`` (relative
    to the current working directory). For every mapping in
    ``REALWASTE_TO_OURS`` the source images are visited in random order,
    converted to JPEG and written to ``dest_root/<Class>/``. Images that are
    too small, unreadable, or duplicates of images already in the target
    folder are skipped.

    Args:
        dest_root: Output root; one sub-directory per entry of ``CLASSES``.
        per_class: Maximum number of images to add per target class.
        min_side: Minimum length of the shorter image side, in pixels.

    Returns:
        Number of images added per target class.

    Raises:
        RuntimeError: if the class folders cannot be located after unzipping.
    """
    raw_dir = pathlib.Path("data/raw/realwaste")
    zip_path = raw_dir / "realwaste.zip"
    out_dir = raw_dir / "unzipped"

    download_realwaste(zip_path)
    unzip_any(zip_path, out_dir)

    data_root = locate_realwaste_root(out_dir)
    if data_root is None:
        raise RuntimeError("Could not locate RealWaste class folders after unzip.")

    print(f"[RealWaste] Data root: {data_root}")

    # Prepare output and dedupe
    for c in CLASSES:
        ensure_dir(dest_root / c)
    existing = collect_existing_hashes(dest_root)

    added = {c: 0 for c in CLASSES}
    dropped = {"dup": 0, "too_small": 0, "decode_fail": 0, "save_fail": 0}

    # Suffixes are compared in lower case so that e.g. "IMG_1.JPG" is picked
    # up on case-sensitive file systems as well.
    image_suffixes = {".jpg", ".jpeg", ".png", ".bmp"}

    # Iterate source classes
    for src_name, tgt_name in REALWASTE_TO_OURS.items():
        # Skip if the target already received enough images
        if added[tgt_name] >= per_class:
            continue

        src_dir = data_root / src_name
        if not src_dir.exists():
            continue

        # Gather files (recursively)
        files: List[pathlib.Path] = [
            p for p in src_dir.rglob("*")
            if p.suffix.lower() in image_suffixes and p.is_file()
        ]
        if not files:
            continue
        random.shuffle(files)

        tgt_dir = dest_root / tgt_name
        for p in files:
            if added[tgt_name] >= per_class:
                break
            try:
                with Image.open(p) as im:
                    ok, reason = save_image_to_class(im, tgt_dir, existing[tgt_name], min_side)
            except Exception as e:
                # Image.open or lazy decoding failed; count it and move on.
                ok, reason = False, f"decode_fail:{e}"

            if ok:
                added[tgt_name] += 1
            else:
                # normalize reason key for counters ("save_fail:<msg>" -> "save_fail")
                key = reason.split(":")[0] if reason else "unknown"
                dropped[key] = dropped.get(key, 0) + 1

    print(f"[RealWaste] Added -> {human_count(added)}")
    print(f"[RealWaste] Drops -> {human_count(dropped)}")
    return added

# ---------------- Main ----------------

def main():
    """Command-line entry point: parse options, fetch images, print a summary."""
    # (flag, add_argument keyword arguments) for every supported option.
    options = (
        ("--dest", {"default": "data/external", "help": "Output root directory"}),
        ("--per-class", {"type": int, "default": 500, "help": "Max images per class to add"}),
        ("--min-side", {"type": int, "default": 224, "help": "Minimum shorter-side in pixels"}),
    )
    parser = argparse.ArgumentParser()
    for flag, kwargs in options:
        parser.add_argument(flag, **kwargs)
    opts = parser.parse_args()

    # Make sure every class folder exists before anything is fetched.
    dest_root = pathlib.Path(opts.dest)
    for label in CLASSES:
        ensure_dir(dest_root / label)

    total = fetch_from_realwaste(dest_root, opts.per_class, opts.min_side)

    print(f"[TOTAL] Added -> {human_count(total)}")
    print(f"Images are saved under: {dest_root}")
    print("Next: merge with src\\merge_external_into_dataset.py")


if __name__ == "__main__":
    main()