""" scripts/prepare_ifakefakedb.py Prepares the iFakeFaceDB dataset for fingerprint engine training. Kaggle slug: tapakah68/artificial-faces-dataset or similar. iFakeFaceDB contains ~87k StyleGAN-generated fake faces, useful for increasing unknown_gan class coverage. Kaggle usage: !python scripts/prepare_ifakefakedb.py \ --source /kaggle/input/artificial-faces-dataset \ --output /kaggle/working/processed/fingerprint \ --max 20000 """ from __future__ import annotations import argparse import logging import random import shutil from pathlib import Path logging.basicConfig(level=logging.INFO, format="%(asctime)s %(message)s") log = logging.getLogger(__name__) IMG_EXTS = {".jpg", ".jpeg", ".png"} def main(args: argparse.Namespace) -> None: source = Path(args.source) if not source.exists(): log.error(f"Source not found: {source}") return rng = random.Random(args.seed) imgs = [p for p in source.rglob("*") if p.suffix.lower() in IMG_EXTS] rng.shuffle(imgs) imgs = imgs[:args.max] n_train = int(len(imgs) * 0.85) splits = {"train": imgs[:n_train], "val": imgs[n_train:]} for split, subset in splits.items(): dst_dir = Path(args.output) / split / "fake" dst_dir.mkdir(parents=True, exist_ok=True) for img in subset: dst = dst_dir / f"ifake_{img.name}" if not dst.exists(): shutil.copy2(img, dst) log.info(f" {split}/fake: {len(subset)} images (generator: unknown_gan / StyleGAN)") log.info("iFakeFaceDB preparation complete.") def parse_args(): p = argparse.ArgumentParser() p.add_argument("--source", default="/kaggle/input/artificial-faces-dataset") p.add_argument("--output", default="/kaggle/working/processed/fingerprint") p.add_argument("--max", type=int, default=20000) p.add_argument("--seed", type=int, default=42) return p.parse_args() if __name__ == "__main__": main(parse_args())