Spaces:
Sleeping
Sleeping
| from __future__ import annotations | |
| import argparse | |
| import random | |
| from pathlib import Path | |
| from PIL import Image | |
| from src.ai_image_detector.config import PROCESSED_DATA_DIR | |
# File extensions accepted as image inputs. Entries are lower-case because
# list_images lower-cases each candidate's suffix before the membership test.
VALID_SUFFIXES = {
    ".bmp",
    ".jpeg",
    ".jpg",
    ".png",
    ".webp",
}
def list_images(directory: Path) -> list[Path]:
    """Collect every image file found anywhere beneath *directory*.

    The search is recursive. A regular file qualifies when its suffix,
    lower-cased, is a member of ``VALID_SUFFIXES``.

    Args:
        directory: Root folder to search.

    Returns:
        The qualifying paths, sorted.

    Raises:
        FileNotFoundError: If *directory* does not exist.
        ValueError: If the search finds no qualifying file.
    """
    if not directory.exists():
        raise FileNotFoundError(f"Folder not found: {directory}")

    images: list[Path] = []
    for candidate in directory.rglob("*"):
        if not candidate.is_file():
            continue
        if candidate.suffix.lower() in VALID_SUFFIXES:
            images.append(candidate)

    if not images:
        raise ValueError(f"No valid images found in: {directory}")

    images.sort()
    return images
def save_as_png(source: Path, destination: Path) -> None:
    """Re-encode the image at *source* as a PNG written to *destination*.

    Missing parent folders of *destination* are created first. The image is
    converted to the "RGB" mode before it is saved.

    Args:
        source: Path of the image to read.
        destination: Path of the PNG file to write.
    """
    target_folder = destination.parent
    target_folder.mkdir(parents=True, exist_ok=True)
    with Image.open(source) as opened:
        rgb_copy = opened.convert("RGB")
        rgb_copy.save(destination, format="PNG")
def remove_prefix_files(directory: Path, prefix: str) -> int:
    """Delete the ``<prefix>_*.png`` files located directly in *directory*.

    Only the top level of *directory* is scanned; sub-folders are not
    visited. *prefix* is matched literally: glob metacharacters in it
    (``*``, ``?``, ``[``) are escaped, so a prefix such as ``set[1]`` removes
    ``set[1]_<anything>.png`` rather than whatever the character class would
    otherwise match.

    Args:
        directory: Folder to clean up.
        prefix: Literal filename prefix; it must be followed by an underscore
            in the filename for the file to qualify.

    Returns:
        The number of files deleted.
    """
    # Imported locally so this function does not rely on a module-level
    # import being present.
    import glob

    pattern = f"{glob.escape(prefix)}_*.png"
    # Snapshot the matches first so no file is unlinked while the directory
    # is still being scanned.
    doomed = list(directory.glob(pattern))
    for path in doomed:
        path.unlink()
    return len(doomed)
def copy_subset(
    files: list[Path],
    destination_dir: Path,
    prefix: str,
    count: int,
    seed: int,
) -> int:
    """Write a seeded random selection of *files* as PNGs in *destination_dir*.

    A copy of *files* is shuffled with ``random.Random(seed)`` and its first
    *count* entries are kept. Each kept file is saved through ``save_as_png``
    under the name ``<prefix>_<index:05d>_<source stem>.png``, where the index
    is the file's position within the selection.

    Args:
        files: Candidate source images; the list itself is not modified.
        destination_dir: Folder that receives the PNG files.
        prefix: Leading part of every output filename.
        count: Maximum number of images to write. When it exceeds
            ``len(files)``, every file is written.
        seed: Seed for the shuffle.

    Returns:
        The number of images written.

    Raises:
        ValueError: If *count* is negative.
    """
    if count < 0:
        # A negative count would make the slice below drop items from the end
        # of the shuffled list instead of limiting the selection.
        raise ValueError(f"count must be non-negative, got {count}.")

    candidates = list(files)  # shuffle a copy, not the caller's list
    random.Random(seed).shuffle(candidates)
    selected = candidates[:count]

    for index, source in enumerate(selected):
        destination = destination_dir / f"{prefix}_{index:05d}_{source.stem}.png"
        save_as_png(source, destination)
    return len(selected)
def main() -> None:
    """Command-line entry point: add Gemini fakes plus matching real images.

    Parses the command line, samples ``--gemini-count`` images from
    ``--gemini-dir`` into ``PROCESSED_DATA_DIR / "fake"`` and ``--real-count``
    images (defaulting to ``--gemini-count``) from ``--real-dir`` into
    ``PROCESSED_DATA_DIR / "real"``. With ``--replace-prefix``, existing
    output files carrying the same prefixes are deleted first.

    Negative counts are rejected as usage errors through ``parser.error``
    (exit status 2) right after parsing, before any directory is created or
    any file is deleted.

    Raises:
        ValueError: If a source folder holds fewer images than requested.
            ``list_images`` may additionally raise ``FileNotFoundError`` or
            ``ValueError`` for a missing or image-less source folder.
    """
    parser = argparse.ArgumentParser(
        description=(
            "Add Gemini-focused fake images plus matched real images "
            "into training folders."
        )
    )
    parser.add_argument(
        "--gemini-dir",
        type=Path,
        required=True,
        help="Folder containing Gemini-generated images (fake).",
    )
    parser.add_argument(
        "--real-dir",
        type=Path,
        required=True,
        help="Folder containing real photos for balancing.",
    )
    parser.add_argument(
        "--gemini-count",
        type=int,
        default=700,
        help="How many Gemini fake images to add.",
    )
    parser.add_argument(
        "--real-count",
        type=int,
        default=None,
        help="How many real images to add. Defaults to --gemini-count.",
    )
    parser.add_argument(
        "--fake-prefix",
        type=str,
        default="gemini_fake",
        help="Filename prefix for copied Gemini fake images.",
    )
    parser.add_argument(
        "--real-prefix",
        type=str,
        default="gemini_match_real",
        help="Filename prefix for copied real images.",
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=42,
        help="Random seed for subset sampling.",
    )
    parser.add_argument(
        "--replace-prefix",
        action="store_true",
        help="Delete existing files with the same prefixes before copying new ones.",
    )
    args = parser.parse_args()

    real_count = args.gemini_count if args.real_count is None else args.real_count

    # A negative count passes the "not enough images" checks below and would
    # then be used as a slice bound when sampling, silently selecting the
    # wrong number of files. Reject it before touching the filesystem.
    if args.gemini_count < 0:
        parser.error(
            f"--gemini-count must be non-negative, got {args.gemini_count}."
        )
    if real_count < 0:
        parser.error(f"--real-count must be non-negative, got {real_count}.")

    fake_out_dir = PROCESSED_DATA_DIR / "fake"
    real_out_dir = PROCESSED_DATA_DIR / "real"
    fake_out_dir.mkdir(parents=True, exist_ok=True)
    real_out_dir.mkdir(parents=True, exist_ok=True)

    gemini_files = list_images(args.gemini_dir)
    real_files = list_images(args.real_dir)

    if len(gemini_files) < args.gemini_count:
        raise ValueError(
            f"Requested {args.gemini_count} Gemini images, but only {len(gemini_files)} found."
        )
    if len(real_files) < real_count:
        raise ValueError(
            f"Requested {real_count} real images, but only {len(real_files)} found."
        )

    # Deletion happens only after every check above has passed, so a failed
    # run leaves previously copied files in place.
    if args.replace_prefix:
        removed_fake = remove_prefix_files(fake_out_dir, args.fake_prefix)
        removed_real = remove_prefix_files(real_out_dir, args.real_prefix)
        print(f"Removed {removed_fake} fake files with prefix '{args.fake_prefix}'.")
        print(f"Removed {removed_real} real files with prefix '{args.real_prefix}'.")

    added_fake = copy_subset(
        files=gemini_files,
        destination_dir=fake_out_dir,
        prefix=args.fake_prefix,
        count=args.gemini_count,
        seed=args.seed,
    )
    # The real-image sample uses a seed offset from the fake-image one.
    added_real = copy_subset(
        files=real_files,
        destination_dir=real_out_dir,
        prefix=args.real_prefix,
        count=real_count,
        seed=args.seed + 11,
    )

    print(f"Added {added_fake} Gemini fake images to {fake_out_dir}")
    print(f"Added {added_real} real images to {real_out_dir}")
    print("Now run: python train.py")


# Run the CLI only when this file is executed as a script.
if __name__ == "__main__":
    main()