r"""
scripts/prepare_cifake.py

Prepares the CIFAKE dataset (Kaggle: bird-coder/cifake-real-and-ai-generated-synthetic-images)
for use in the fingerprint engine training pipeline.

CIFAKE contains 60k real images (CIFAR-10) and 60k AI-generated equivalents.
Useful as extra training data for the fingerprint engine.

Kaggle usage:
    !python scripts/prepare_cifake.py \
        --source /kaggle/input/cifake-real-and-ai-generated-synthetic-images \
        --output /kaggle/working/processed/fingerprint \
        --max_per_class 20000
"""
from __future__ import annotations

import argparse
import logging
import random
import shutil
from pathlib import Path

logging.basicConfig(level=logging.INFO, format="%(asctime)s %(message)s")
log = logging.getLogger(__name__)

# File suffixes (compared case-insensitively) that count as images.
IMG_EXTS = {".jpg", ".jpeg", ".png"}


def main(args: argparse.Namespace) -> None:
    """Copy a seeded random sample of CIFAKE images into the output tree.

    For each source split (``train``, ``test``) and label (``REAL``, ``FAKE``)
    the images are looked up in ``<source>/<split>/<label>``, falling back to
    ``<source>/<label>`` when the split directory does not exist. Up to
    ``max_per_class`` images are sampled and copied to
    ``<output>/<train|val>/<real|fake>/cifake_<original name>``; the source
    ``test`` split is written to ``val``. Destination files that already exist
    are left untouched.

    Args:
        args: Namespace providing ``source`` and ``output`` (paths),
            ``max_per_class`` (sample cap per split/label) and ``seed``
            (seed for the sampling RNG).

    Returns:
        None. A missing source directory is logged as an error and the
        function returns without creating anything.
    """
    source = Path(args.source)
    output = Path(args.output)
    rng = random.Random(args.seed)

    if not source.exists():
        log.error("Source not found: %s", source)
        return

    # Files already handed out, keyed by the directory they came from. In the
    # flat fallback layout both splits read the same directory; excluding
    # earlier picks keeps the train and val samples disjoint.
    assigned: dict[Path, set[Path]] = {}

    for split in ["train", "test"]:
        out_split = "train" if split == "train" else "val"

        for label, class_name in [("REAL", "real"), ("FAKE", "fake")]:
            src_dir = source / split / label
            if not src_dir.exists():
                src_dir = source / label
            if not src_dir.exists():
                log.warning(" Not found: %s", src_dir)
                continue

            already_used = assigned.setdefault(src_dir, set())

            # iterdir() order is filesystem-dependent; sorting first makes the
            # seeded shuffle select the same files on every machine and run.
            # is_file() skips directories that merely carry an image suffix.
            imgs = sorted(
                p
                for p in src_dir.iterdir()
                if p.is_file()
                and p.suffix.lower() in IMG_EXTS
                and p not in already_used
            )
            rng.shuffle(imgs)
            imgs = imgs[:args.max_per_class]

            if already_used and not imgs:
                log.warning(
                    " No unused images left in %s for split %s", src_dir, split
                )
            already_used.update(imgs)

            dst_dir = output / out_split / class_name
            dst_dir.mkdir(parents=True, exist_ok=True)

            for img in imgs:
                dst = dst_dir / f"cifake_{img.name}"
                # Never overwrite: keeps re-runs cheap and idempotent.
                if not dst.exists():
                    shutil.copy2(img, dst)

            log.info(
                " cifake/%s/%s → %s/%s: %d images",
                split,
                label,
                out_split,
                class_name,
                len(imgs),
            )

    log.info("CIFAKE preparation complete.")


def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse command-line options for the CIFAKE preparation script.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``.

    Returns:
        Namespace with ``source``, ``output``, ``max_per_class`` and ``seed``.
    """
    p = argparse.ArgumentParser(
        description="Prepare the CIFAKE dataset for fingerprint engine training."
    )
    p.add_argument(
        "--source",
        default="/kaggle/input/cifake-real-and-ai-generated-synthetic-images",
        help="Root directory of the CIFAKE dataset.",
    )
    p.add_argument(
        "--output",
        default="/kaggle/working/processed/fingerprint",
        help="Directory that receives the train/val image folders.",
    )
    p.add_argument(
        "--max_per_class",
        type=int,
        default=20000,
        help="Maximum number of images copied per split and class.",
    )
    p.add_argument(
        "--seed",
        type=int,
        default=42,
        help="Seed for the random sampling of images.",
    )
    return p.parse_args(argv)


if __name__ == "__main__":
    main(parse_args())