Spaces:
Running
Running
| """ | |
scripts/prepare_cifake.py

Prepares the CIFAKE dataset (Kaggle: bird-coder/cifake-real-and-ai-generated-synthetic-images)
for use in the fingerprint engine training pipeline.

CIFAKE contains 60k real images (CIFAR-10) and 60k AI-generated equivalents.
Useful as extra training data for the fingerprint engine.

Images are read from <source>/{train,test}/{REAL,FAKE} and copied to
<output>/{train,val}/{real,fake} with a "cifake_" filename prefix (the
dataset's "test" split becomes "val"). --max_per_class caps the number of
images taken per class within each split.

Kaggle usage:
    !python scripts/prepare_cifake.py \
        --source /kaggle/input/cifake-real-and-ai-generated-synthetic-images \
        --output /kaggle/working/processed/fingerprint \
        --max_per_class 20000
| """ | |
| from __future__ import annotations | |
| import argparse | |
| import logging | |
| import random | |
| import shutil | |
| from pathlib import Path | |
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(message)s")
log = logging.getLogger(__name__)

# Lower-case file suffixes (including the dot) that are treated as images.
IMG_EXTS = {".jpg", ".jpeg", ".png"}


def main(args: argparse.Namespace) -> None:
    """Copy a seeded random sample of CIFAKE images into the output tree.

    For each split ("train", "test") and each label ("REAL", "FAKE") the
    images are looked up in ``<source>/<split>/<label>``, falling back to
    ``<source>/<label>`` when the split folder is absent. At most
    ``max_per_class`` of them are copied to
    ``<output>/<train|val>/<real|fake>/cifake_<original name>``; the "test"
    split is written as "val". Files that already exist at the destination
    are left untouched.

    Args:
        args: Namespace providing ``source`` and ``output`` (paths),
            ``max_per_class`` (int cap per class and split) and ``seed``
            (int seed for the selection shuffle).

    Returns:
        None. A missing source directory or a negative ``max_per_class`` is
        logged as an error and the function returns without copying.
    """
    source = Path(args.source)
    output = Path(args.output)
    rng = random.Random(args.seed)

    if not source.exists():
        log.error(f"Source not found: {source}")
        return
    if args.max_per_class < 0:
        # A negative slice bound would silently mean "all but the last N".
        log.error(f"--max_per_class must be >= 0, got {args.max_per_class}")
        return

    # Images already selected from each resolved source directory. With a flat
    # layout both splits resolve to the same folder; excluding what an earlier
    # split took keeps train and val disjoint.
    taken: dict[Path, set[Path]] = {}

    for split in ("train", "test"):
        out_split = "train" if split == "train" else "val"
        for label, out_label in (("REAL", "real"), ("FAKE", "fake")):
            nested_dir = source / split / label
            src_dir = nested_dir if nested_dir.is_dir() else source / label
            if not src_dir.is_dir():
                log.warning(f" Not found: {nested_dir} (nor fallback {src_dir})")
                continue

            already = taken.setdefault(src_dir, set())
            # iterdir() yields entries in arbitrary order; sort first so the
            # seeded shuffle selects the same files on every run and machine.
            imgs = sorted(
                p
                for p in src_dir.iterdir()
                if p.suffix.lower() in IMG_EXTS and p.is_file() and p not in already
            )
            rng.shuffle(imgs)
            imgs = imgs[:args.max_per_class]
            already.update(imgs)

            dst_dir = output / out_split / out_label
            dst_dir.mkdir(parents=True, exist_ok=True)

            copied = 0
            for img in imgs:
                dst = dst_dir / f"cifake_{img.name}"
                if not dst.exists():
                    shutil.copy2(img, dst)
                    copied += 1

            log.info(
                f" cifake/{split}/{label} → {out_split}/{out_label}: {len(imgs)} images"
                f" ({copied} copied, {len(imgs) - copied} already present)"
            )

    log.info("CIFAKE preparation complete.")
| def parse_args(): | |
| p = argparse.ArgumentParser() | |
| p.add_argument("--source", default="/kaggle/input/cifake-real-and-ai-generated-synthetic-images") | |
| p.add_argument("--output", default="/kaggle/working/processed/fingerprint") | |
| p.add_argument("--max_per_class", type=int, default=20000) | |
| p.add_argument("--seed", type=int, default=42) | |
| return p.parse_args() | |
# Script entry point: parse the command line, then run the preparation.
if __name__ == "__main__":
    cli_args = parse_args()
    main(cli_args)