Spaces:
Sleeping
Sleeping
| import os | |
| import shutil | |
| import cv2 | |
| from PIL import Image | |
| import torch | |
| from facenet_pytorch import MTCNN | |
| from tqdm import tqdm | |
# Run face detection on the GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Loading MTCNN onto {device}...")
# keep_all=False: the detector hands back a single face per image.
mtcnn = MTCNN(keep_all=False, device=device)

# Source directories
CIFAKE_TRAIN = "dataset/train"
CIFAKE_TEST = "dataset/test"
DEEPFAKE_DIR = "dataset/flickr_deepfake"

# Base directories
BASE_VAL_DIR = "dataset/validation"
PROCESSED_TRAIN_DIR = "dataset/processed_train"
PROCESSED_VAL_DIR = "dataset/processed_val"

# Make sure both processed roots have their 'real' and 'fake' class folders
# before any extraction function writes into them.
for processed_root in (PROCESSED_TRAIN_DIR, PROCESSED_VAL_DIR):
    for label in ('real', 'fake'):
        os.makedirs(os.path.join(processed_root, label), exist_ok=True)
def process_real_faces(src_dir: str, dest_dir: str) -> None:
    """Crop faces from a directory of REAL images into ``dest_dir/real``.

    Intended for the REAL dataset (e.g., FFHQ). Every ``.png``/``.jpg``/
    ``.jpeg`` file directly inside ``src_dir`` is opened, converted to RGB and
    passed to the module-level ``mtcnn`` detector. When a face is found its
    crop is written to ``dest_dir/real/real_<filename>``; otherwise the whole
    image is resized to 224x224 and saved under that same name.

    Args:
        src_dir: Directory holding the source images (not searched
            recursively).
        dest_dir: Output root; images are written to its ``real`` subfolder.

    Returns:
        None. If ``src_dir`` does not exist a warning is printed and nothing
        is processed. A failure on an individual image is printed and that
        image is skipped, so one bad file does not abort the run.
    """
    print(f"Processing REAL images from {src_dir} to {dest_dir}...")
    if not os.path.exists(src_dir):
        print(f"Warning: {src_dir} not found. Skipping.")
        return

    real_dir = os.path.join(dest_dir, 'real')
    # The fallback branch saves through PIL, which does not create missing
    # directories, so make sure the target folder exists up front.
    os.makedirs(real_dir, exist_ok=True)

    for filename in tqdm(os.listdir(src_dir), desc="Extracting REAL Faces"):
        if not filename.lower().endswith(('.png', '.jpg', '.jpeg')):
            continue

        src_path = os.path.join(src_dir, filename)
        dest_path = os.path.join(real_dir, f"real_{filename}")

        try:
            # Close the source file handle as soon as the RGB copy exists.
            with Image.open(src_path) as raw:
                img = raw.convert('RGB')

            # One detector pass is enough: with save_path set, MTCNN writes
            # the crop itself when a face is found, and returns None (writing
            # nothing) when no face is detected.
            face = mtcnn(img, save_path=dest_path)
            if face is None:
                # Fallback: keep the sample as a plain 224x224 resize.
                img.resize((224, 224)).save(dest_path)
        except Exception as e:
            # Best-effort batch job: report the failure and move on.
            print(f"Error extracting face from {filename}: {e}")
def process_deepfake(src_dir: str, dest_dir: str) -> None:
    """Crop faces from a mixed real/fake directory and sort them by label.

    flickr_deepfake images are full resolution with Real and Fake mixed in
    one folder. Every ``.png``/``.jpg``/``.jpeg`` file directly inside
    ``src_dir`` is labelled by a filename heuristic: a name containing an
    underscore (``ID_ID.jpg`` style, implying a face swap) is treated as
    ``fake``, anything else as ``real``. The image is converted to RGB and
    passed to the module-level ``mtcnn`` detector. When a face is found its
    crop is written to ``dest_dir/<label>/df_<filename>``; otherwise the whole
    image is resized to 224x224 and saved under that same name.

    Args:
        src_dir: Directory holding the mixed source images (not searched
            recursively).
        dest_dir: Output root; images are written to its ``real`` or ``fake``
            subfolder.

    Returns:
        None. If ``src_dir`` does not exist a warning is printed and nothing
        is processed. A failure on an individual image is printed and that
        image is skipped, so one bad file does not abort the run.
    """
    print(f"Processing Deepfake (MTCNN Face Extraction) from {src_dir} to {dest_dir}...")
    if not os.path.exists(src_dir):
        print(f"Warning: {src_dir} not found. Skipping.")
        return

    # The fallback branch saves through PIL, which does not create missing
    # directories, so make sure both label folders exist up front.
    for label in ('real', 'fake'):
        os.makedirs(os.path.join(dest_dir, label), exist_ok=True)

    for filename in tqdm(os.listdir(src_dir), desc="Extracting Faces"):
        if not filename.lower().endswith(('.png', '.jpg', '.jpeg')):
            continue

        src_path = os.path.join(src_dir, filename)

        # Heuristic: If ID_ID.jpg format (has underscore), it is fake. Else real.
        dest_label = 'fake' if '_' in filename else 'real'
        dest_path = os.path.join(dest_dir, dest_label, f"df_{filename}")

        try:
            # Close the source file handle as soon as the RGB copy exists.
            with Image.open(src_path) as raw:
                img = raw.convert('RGB')

            # One detector pass is enough: with save_path set, MTCNN writes
            # the crop itself when a face is found, and returns None (writing
            # nothing) when no face is detected.
            face = mtcnn(img, save_path=dest_path)
            if face is None:
                # Fallback if MTCNN fails finding a face: plain 224x224 resize.
                img.resize((224, 224)).save(dest_path)
        except Exception as e:
            # Best-effort batch job: report the failure and move on.
            print(f"Error extracting face from {filename}: {e}")
if __name__ == "__main__":
    # Only the REAL (FFHQ) pass runs here. The deepfake pass stays disabled
    # because, per the original author's note, the FAKE data has already been
    # extracted and a re-run takes roughly 2.5 hours:
    #   process_deepfake(DEEPFAKE_DIR, PROCESSED_TRAIN_DIR)
    ffhq_source_dir = "dataset/ffhq_real"

    print("=== EXTRACTING REAL FFHQ FACES ===")
    process_real_faces(ffhq_source_dir, PROCESSED_TRAIN_DIR)
    print("Real FFHQ Extraction Complete! All unified images are stored in dataset/processed_train.")