# SDK-Docker / curate_hard_examples.py
# Lucifer9907's picture
# Prepare Hugging Face Docker Space
# ff0c419
from __future__ import annotations
import argparse
import shutil
from pathlib import Path
import cv2
import numpy as np
import tensorflow as tf
from src.ai_image_detector.data import preprocess_image
# File extensions (lower-case, including the dot) treated as images.
VALID_SUFFIXES = {".jpg", ".jpeg", ".png", ".webp", ".bmp"}


def iter_images(folder: Path) -> list[Path]:
    """Return the image files located directly inside *folder*, sorted by path.

    Only regular files whose extension (compared case-insensitively) is in
    ``VALID_SUFFIXES`` are kept. Sub-directories are not descended into.

    Args:
        folder: Directory to scan.

    Returns:
        The matching paths in ascending order.
    """
    images = [
        entry
        for entry in folder.iterdir()
        if entry.is_file() and entry.suffix.lower() in VALID_SUFFIXES
    ]
    images.sort()
    return images
def score_images(model: tf.keras.Model, files: list[Path]) -> list[tuple[Path, float]]:
    """Run the model on every readable image and collect its score.

    Each file is loaded with ``cv2.imread``, passed through
    ``preprocess_image``, wrapped into a batch of one and fed to
    ``model.predict``. Files for which ``cv2.imread`` returns ``None`` are
    skipped without an error.

    Args:
        model: Model used for prediction.
        files: Image paths to score.

    Returns:
        ``(path, score)`` pairs in the same order as *files*, where the score
        is element ``[0][0]`` of the prediction converted to ``float``.
    """
    results: list[tuple[Path, float]] = []
    for path in files:
        pixels = cv2.imread(str(path))
        if pixels is not None:
            # The model is called on a batch, so add a leading axis of size 1.
            batch = np.expand_dims(preprocess_image(pixels), axis=0)
            output = model.predict(batch, verbose=0)
            # NOTE(review): assumes one output value per image at [0][0] — confirm.
            results.append((path, float(output[0][0])))
    return results
def clear_previous_curated(processed_real: Path, processed_fake: Path) -> None:
    """Delete hard examples left over from an earlier curation run.

    Removes every entry matching ``hard_real_*`` in *processed_real* and every
    entry matching ``hard_fake_*`` in *processed_fake*. Other files are left
    untouched.

    Args:
        processed_real: Training folder holding real images.
        processed_fake: Training folder holding fake images.
    """
    cleanup_plan = (
        (processed_real, "hard_real_*"),
        (processed_fake, "hard_fake_*"),
    )
    for directory, pattern in cleanup_plan:
        for stale in directory.glob(pattern):
            # missing_ok: a file that vanished in the meantime is not an error.
            stale.unlink(missing_ok=True)
def _copy_ranked(
    selected: list[tuple[Path, float]],
    destination: Path,
    prefix: str,
) -> None:
    """Copy *selected* into *destination* as ``<prefix>_<rank>_<score><ext>``."""
    if not selected:
        return
    # Make sure the target folder exists; copy2 would otherwise fail with
    # FileNotFoundError the first time the pipeline runs.
    destination.mkdir(parents=True, exist_ok=True)
    for rank, (src, score) in enumerate(selected):
        file_name = f"{prefix}_{rank:04d}_{score:.4f}{src.suffix.lower()}"
        shutil.copy2(src, destination / file_name)


def copy_selected(
    selected_real: list[tuple[Path, float]],
    selected_fake: list[tuple[Path, float]],
    processed_real: Path,
    processed_fake: Path,
) -> None:
    """Copy the selected hard examples into the processed training folders.

    Every copy is renamed to ``hard_real_<rank>_<score><ext>`` or
    ``hard_fake_<rank>_<score><ext>``, where ``rank`` is the zero-padded
    position in the list, ``score`` is formatted with four decimals and
    ``ext`` is the lower-cased source extension. Destination folders are
    created when they do not exist yet.

    Args:
        selected_real: ``(path, score)`` pairs of real images to copy.
        selected_fake: ``(path, score)`` pairs of fake images to copy.
        processed_real: Destination folder for real images.
        processed_fake: Destination folder for fake images.
    """
    _copy_ranked(selected_real, processed_real, "hard_real")
    _copy_ranked(selected_fake, processed_fake, "hard_fake")
def main() -> None:
    """Mine the hardest holdout images and add them to the training data.

    Steps:
        1. Parse ``--project-root`` and ``--per-class``.
        2. Load the model from ``artifacts/ai_image_detector.keras``.
        3. Score every image under ``data/holdout_newdb/real`` and
           ``data/holdout_newdb/fake``.
        4. Keep the ``per_class`` highest-scoring real images and the
           ``per_class`` lowest-scoring fake images.
        5. Remove previously curated ``hard_*`` files from ``data/processed``
           and copy the new selection in.
        6. Print how many examples were added and their score ranges.

    Raises:
        FileNotFoundError: If the model file does not exist.
        SystemExit: If ``--per-class`` is negative (reported by argparse).
    """
    parser = argparse.ArgumentParser(description="Mine hard holdout examples into training.")
    parser.add_argument(
        "--project-root",
        type=Path,
        # NOTE(review): machine-specific Windows default; pass --project-root
        # explicitly when running anywhere else.
        default=Path(r"E:\ML mini-project"),
    )
    parser.add_argument(
        "--per-class",
        type=int,
        default=300,
        help="How many hard examples to add per class",
    )
    args = parser.parse_args()

    # A negative count would make the slices below mean "all but the last N"
    # instead of "the first N", silently selecting the wrong examples.
    if args.per_class < 0:
        parser.error("--per-class must be a non-negative integer")

    project_root = args.project_root
    model_path = project_root / "artifacts" / "ai_image_detector.keras"
    holdout_real = project_root / "data" / "holdout_newdb" / "real"
    holdout_fake = project_root / "data" / "holdout_newdb" / "fake"
    processed_real = project_root / "data" / "processed" / "real"
    processed_fake = project_root / "data" / "processed" / "fake"

    if not model_path.exists():
        raise FileNotFoundError(f"Model not found: {model_path}")

    model = tf.keras.models.load_model(model_path)
    real_scored = score_images(model, iter_images(holdout_real))
    fake_scored = score_images(model, iter_images(holdout_fake))

    # Hard real: model thinks fake (highest AI probability).
    hard_real = sorted(real_scored, key=lambda x: x[1], reverse=True)[: args.per_class]
    # Hard fake: model thinks real (lowest AI probability).
    hard_fake = sorted(fake_scored, key=lambda x: x[1])[: args.per_class]

    clear_previous_curated(processed_real, processed_fake)
    copy_selected(hard_real, hard_fake, processed_real, processed_fake)

    print(f"Added hard real examples: {len(hard_real)}")
    print(f"Added hard fake examples: {len(hard_fake)}")
    # hard_real is sorted descending and hard_fake ascending, so both lines
    # print the range as lowest -> highest.
    if hard_real:
        print(f"Hard real score range: {hard_real[-1][1]:.4f} -> {hard_real[0][1]:.4f}")
    if hard_fake:
        print(f"Hard fake score range: {hard_fake[0][1]:.4f} -> {hard_fake[-1][1]:.4f}")


# Run the curation pipeline only when executed as a script.
if __name__ == "__main__":
    main()