"""Crop faces out of raw images with MTCNN and sort them into real/fake folders.

Importing this module loads an MTCNN detector onto the GPU when CUDA is
available (CPU otherwise) and creates the ``real``/``fake`` sub-directories
of the processed train and validation folders.
"""

import os
import shutil

import cv2
import torch
from facenet_pytorch import MTCNN
from PIL import Image
from tqdm import tqdm

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Loading MTCNN onto {device}...")
# keep_all=False: a single face crop is produced per image.
mtcnn = MTCNN(keep_all=False, device=device)

# Base directories
BASE_VAL_DIR = "dataset/validation"
PROCESSED_TRAIN_DIR = "dataset/processed_train"
PROCESSED_VAL_DIR = "dataset/processed_val"

# Source directories
CIFAKE_TRAIN = "dataset/train"
CIFAKE_TEST = "dataset/test"
DEEPFAKE_DIR = "dataset/flickr_deepfake"

# File extensions (lower-case) that are treated as images.
IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg')
# Size used when no face is detected and the whole image is kept instead.
FALLBACK_SIZE = (224, 224)

for p in [PROCESSED_TRAIN_DIR, PROCESSED_VAL_DIR]:
    os.makedirs(os.path.join(p, 'real'), exist_ok=True)
    os.makedirs(os.path.join(p, 'fake'), exist_ok=True)


def _extract_face(src_path, dest_path):
    """Save the face found in ``src_path`` to ``dest_path``.

    The detector is run exactly once: when it is given ``save_path`` it
    writes the crop itself and returns the face tensor, or returns ``None``
    (writing nothing) when no face is found. In the latter case the whole
    image, resized to ``FALLBACK_SIZE``, is written instead.

    Any exception raised while opening, detecting or saving propagates to
    the caller.
    """
    # Close the file handle promptly; convert() returns a fully loaded copy.
    with Image.open(src_path) as raw:
        img = raw.convert('RGB')

    face = mtcnn(img, save_path=dest_path)
    if face is None:
        # Fallback if MTCNN fails finding a face.
        img.resize(FALLBACK_SIZE).save(dest_path)


def process_real_faces(src_dir, dest_dir):
    """Extract faces from a directory that contains only REAL images.

    Intended for a real-face dataset (e.g. FFHQ). Every image in ``src_dir``
    is written to ``<dest_dir>/real/real_<filename>``, either as the detected
    face crop or, when no face is detected, as the resized full image.

    Args:
        src_dir: Directory holding the source images. If it does not exist a
            warning is printed and nothing is processed.
        dest_dir: Root output directory; the ``real`` sub-directory is
            created if missing.

    Per-file errors are printed and do not stop the run.
    """
    print(f"Processing REAL images from {src_dir} to {dest_dir}...")
    if not os.path.exists(src_dir):
        print(f"Warning: {src_dir} not found. Skipping.")
        return

    real_dir = os.path.join(dest_dir, 'real')
    os.makedirs(real_dir, exist_ok=True)

    for filename in tqdm(os.listdir(src_dir), desc="Extracting REAL Faces"):
        if not filename.lower().endswith(IMAGE_EXTENSIONS):
            continue

        src_path = os.path.join(src_dir, filename)
        dest_path = os.path.join(real_dir, f"real_{filename}")
        try:
            _extract_face(src_path, dest_path)
        except Exception as e:
            # Best-effort batch job: report the bad file and keep going.
            print(f"Error extracting face from {filename}: {e}")


def process_deepfake(src_dir, dest_dir):
    """Extract faces from a directory where REAL and FAKE images are mixed.

    The label is decided from the file name alone: a name containing an
    underscore (``ID_ID.jpg`` style, implying a face swap) is treated as
    fake, anything else as real. Output goes to
    ``<dest_dir>/<label>/df_<filename>``.

    Args:
        src_dir: Directory holding the source images. If it does not exist a
            warning is printed and nothing is processed.
        dest_dir: Root output directory; the ``real`` and ``fake``
            sub-directories are created if missing.

    Per-file errors are printed and do not stop the run.
    """
    print(f"Processing Deepfake (MTCNN Face Extraction) from {src_dir} to {dest_dir}...")
    if not os.path.exists(src_dir):
        print(f"Warning: {src_dir} not found. Skipping.")
        return

    for label in ('real', 'fake'):
        os.makedirs(os.path.join(dest_dir, label), exist_ok=True)

    for filename in tqdm(os.listdir(src_dir), desc="Extracting Faces"):
        if not filename.lower().endswith(IMAGE_EXTENSIONS):
            continue

        src_path = os.path.join(src_dir, filename)
        # Heuristic: If ID_ID.jpg format (has underscore), it is fake. Else real.
        dest_label = 'fake' if '_' in filename else 'real'
        dest_path = os.path.join(dest_dir, dest_label, f"df_{filename}")
        try:
            _extract_face(src_path, dest_path)
        except Exception as e:
            # Best-effort batch job: report the bad file and keep going.
            print(f"Error extracting face from {filename}: {e}")


if __name__ == "__main__":
    print("=== EXTRACTING REAL FFHQ FACES ===")
    # The FAKE dataset was already extracted in an earlier run; only the REAL
    # one is processed here.
    process_real_faces("dataset/ffhq_real", PROCESSED_TRAIN_DIR)
    # Left disabled on purpose to avoid a ~2.5 hour re-run:
    # process_deepfake(DEEPFAKE_DIR, PROCESSED_TRAIN_DIR)
    print("Real FFHQ Extraction Complete! All unified images are stored in dataset/processed_train.")