# SDK-Docker / add_gemini_data.py
# Lucifer9907's picture
# Prepare Hugging Face Docker Space
# ff0c419
from __future__ import annotations

import argparse
import glob
import random
from pathlib import Path

from PIL import Image

from src.ai_image_detector.config import PROCESSED_DATA_DIR
# Lower-cased file extensions that are treated as images.
VALID_SUFFIXES = {".jpg", ".jpeg", ".png", ".webp", ".bmp"}


def list_images(directory: Path) -> list[Path]:
    """Return every supported image file under ``directory``, sorted by path.

    The search is recursive, and extensions are compared case-insensitively
    against ``VALID_SUFFIXES``.

    Raises:
        FileNotFoundError: If ``directory`` does not exist.
        ValueError: If no file with a supported extension is found.
    """
    if not directory.exists():
        raise FileNotFoundError(f"Folder not found: {directory}")

    images: list[Path] = []
    for candidate in directory.rglob("*"):
        if candidate.suffix.lower() not in VALID_SUFFIXES:
            continue
        # Directories can carry an image-like suffix too; keep regular files only.
        if candidate.is_file():
            images.append(candidate)

    if not images:
        raise ValueError(f"No valid images found in: {directory}")

    images.sort()
    return images
def save_as_png(source: Path, destination: Path) -> None:
    """Re-encode the image at ``source`` as an RGB PNG at ``destination``.

    Missing parent directories of ``destination`` are created first.
    """
    target_dir = destination.parent
    target_dir.mkdir(parents=True, exist_ok=True)

    with Image.open(source) as opened:
        rgb_image = opened.convert("RGB")
        rgb_image.save(destination, format="PNG")
def remove_prefix_files(directory: Path, prefix: str) -> int:
    """Delete the ``<prefix>_*.png`` entries directly inside ``directory``.

    Args:
        directory: Folder to clean; subfolders are not searched.
        prefix: Literal filename prefix. Glob metacharacters in it
            (``*``, ``?``, ``[``) are matched literally, never expanded.

    Returns:
        The number of entries that were deleted.
    """
    # The prefix comes from the command line; without escaping, a value such
    # as "*" would turn the pattern into "*_*.png" and delete unrelated files.
    pattern = f"{glob.escape(prefix)}_*.png"

    # Snapshot the matches first so the directory is not modified while the
    # glob generator is still walking it.
    matches = list(directory.glob(pattern))
    for path in matches:
        path.unlink()
    return len(matches)
def copy_subset(
    files: list[Path],
    destination_dir: Path,
    prefix: str,
    count: int,
    seed: int,
) -> int:
    """Convert a seeded random subset of ``files`` to PNG in ``destination_dir``.

    A copy of ``files`` is shuffled with ``random.Random(seed)`` and the first
    ``count`` entries are written as ``<prefix>_<index:05d>_<stem>.png``.
    If ``count`` exceeds ``len(files)``, every file is written.

    Args:
        files: Candidate source images; the list itself is not modified.
        destination_dir: Folder that receives the PNG files.
        prefix: Filename prefix for every written file.
        count: Maximum number of images to write; must be non-negative.
        seed: Seed for the shuffle, so the same inputs select the same subset.

    Returns:
        The number of images written.

    Raises:
        ValueError: If ``count`` is negative.
    """
    # A negative count would make the slice below mean "all but the last
    # |count| files" and silently copy the wrong number of images.
    if count < 0:
        raise ValueError(f"count must be non-negative, got {count}.")

    rng = random.Random(seed)
    candidates = list(files)
    # Shuffle-then-slice is kept as is so existing seeds pick the same subset.
    rng.shuffle(candidates)

    written = 0
    for index, source in enumerate(candidates[:count]):
        destination = destination_dir / f"{prefix}_{index:05d}_{source.stem}.png"
        save_as_png(source, destination)
        written += 1
    return written
def _build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for this script."""
    parser = argparse.ArgumentParser(
        description="Add Gemini-focused fake images plus matched real images into training folders."
    )
    parser.add_argument(
        "--gemini-dir",
        type=Path,
        required=True,
        help="Folder containing Gemini-generated images (fake).",
    )
    parser.add_argument(
        "--real-dir",
        type=Path,
        required=True,
        help="Folder containing real photos for balancing.",
    )
    parser.add_argument(
        "--gemini-count",
        type=int,
        default=700,
        help="How many Gemini fake images to add.",
    )
    parser.add_argument(
        "--real-count",
        type=int,
        default=None,
        help="How many real images to add. Defaults to --gemini-count.",
    )
    parser.add_argument(
        "--fake-prefix",
        type=str,
        default="gemini_fake",
        help="Filename prefix for copied Gemini fake images.",
    )
    parser.add_argument(
        "--real-prefix",
        type=str,
        default="gemini_match_real",
        help="Filename prefix for copied real images.",
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=42,
        help="Random seed for subset sampling.",
    )
    parser.add_argument(
        "--replace-prefix",
        action="store_true",
        help="Delete existing files with the same prefixes before copying new ones.",
    )
    return parser


def main() -> None:
    """Copy sampled Gemini (fake) and real images into the processed folders.

    Parses the command line, checks that both source folders hold enough
    images, optionally deletes earlier files with the same prefixes, then
    writes the sampled images as PNG into ``PROCESSED_DATA_DIR / "fake"`` and
    ``PROCESSED_DATA_DIR / "real"``.

    Raises:
        ValueError: If a requested count is negative or exceeds the number of
            images found in the matching source folder.
        FileNotFoundError: If a source folder does not exist.
    """
    args = _build_parser().parse_args()
    real_count = args.gemini_count if args.real_count is None else args.real_count

    # Reject negative counts before touching the filesystem: they would pass
    # the "enough images" checks below and then be used as a slice bound.
    if args.gemini_count < 0:
        raise ValueError(
            f"--gemini-count must be non-negative, got {args.gemini_count}."
        )
    if real_count < 0:
        raise ValueError(f"--real-count must be non-negative, got {real_count}.")

    fake_out_dir = PROCESSED_DATA_DIR / "fake"
    real_out_dir = PROCESSED_DATA_DIR / "real"
    fake_out_dir.mkdir(parents=True, exist_ok=True)
    real_out_dir.mkdir(parents=True, exist_ok=True)

    gemini_files = list_images(args.gemini_dir)
    real_files = list_images(args.real_dir)

    if len(gemini_files) < args.gemini_count:
        raise ValueError(
            f"Requested {args.gemini_count} Gemini images, but only {len(gemini_files)} found."
        )
    if len(real_files) < real_count:
        raise ValueError(
            f"Requested {real_count} real images, but only {len(real_files)} found."
        )

    # Deletion happens only after both inputs are validated, so a bad
    # invocation never wipes the previous batch.
    if args.replace_prefix:
        removed_fake = remove_prefix_files(fake_out_dir, args.fake_prefix)
        removed_real = remove_prefix_files(real_out_dir, args.real_prefix)
        print(f"Removed {removed_fake} fake files with prefix '{args.fake_prefix}'.")
        print(f"Removed {removed_real} real files with prefix '{args.real_prefix}'.")

    added_fake = copy_subset(
        files=gemini_files,
        destination_dir=fake_out_dir,
        prefix=args.fake_prefix,
        count=args.gemini_count,
        seed=args.seed,
    )
    # The real subset is sampled with a seed offset from the fake one.
    added_real = copy_subset(
        files=real_files,
        destination_dir=real_out_dir,
        prefix=args.real_prefix,
        count=real_count,
        seed=args.seed + 11,
    )

    print(f"Added {added_fake} Gemini fake images to {fake_out_dir}")
    print(f"Added {added_real} real images to {real_out_dir}")
    print("Now run: python train.py")
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()