Spaces:

Aditya150
/

Veritas-AI

Sleeping

Veritas-AI / extract_faces.py

Aditya-Jadhav150

Initial commit: Veritas-AI Production Build

239017e 2 months ago

4.03 kB

	import os
	import shutil
	import cv2
	from PIL import Image
	import torch
	from facenet_pytorch import MTCNN
	from tqdm import tqdm

	# Run the detector on the GPU when CUDA is usable, otherwise on the CPU.
	if torch.cuda.is_available():
	    device = torch.device('cuda')
	else:
	    device = torch.device('cpu')
	print(f"Loading MTCNN onto {device}...")
	# One shared detector instance, reused by every extraction function below.
	mtcnn = MTCNN(keep_all=False, device=device)

	# Output roots for the processed (face-cropped) datasets.
	BASE_VAL_DIR = "dataset/validation"
	PROCESSED_TRAIN_DIR = "dataset/processed_train"
	PROCESSED_VAL_DIR = "dataset/processed_val"

	# Raw input datasets.
	CIFAKE_TRAIN = "dataset/train"
	CIFAKE_TEST = "dataset/test"
	DEEPFAKE_DIR = "dataset/flickr_deepfake"

	# Make sure each processed root has both class sub-folders before any
	# extraction runs.
	for root_dir in (PROCESSED_TRAIN_DIR, PROCESSED_VAL_DIR):
	    for label in ('real', 'fake'):
	        os.makedirs(os.path.join(root_dir, label), exist_ok=True)

	def process_real_faces(src_dir, dest_dir):
	    """Crop one face per image in ``src_dir`` and save it under ``dest_dir/real``.

	    Intended for a REAL-only dataset (e.g., FFHQ). Only files whose names end
	    in .png, .jpg or .jpeg (case-insensitive) are processed. Every output is
	    named ``real_<original filename>``. When the detector finds no face, the
	    whole image is resized to 224x224 and saved instead, so the sample is
	    still kept.

	    Args:
	        src_dir: Directory holding the source images.
	        dest_dir: Output root; results go into its ``real`` sub-folder, which
	            is created if it does not exist yet.

	    Returns:
	        None. If ``src_dir`` does not exist a warning is printed and nothing
	        is written. Errors on individual files are printed and the file is
	        skipped (best-effort batch processing).
	    """
	    print(f"Processing REAL images from {src_dir} to {dest_dir}...")
	    if not os.path.exists(src_dir):
	        print(f"Warning: {src_dir} not found. Skipping.")
	        return

	    # Do not rely on module-level setup having created this folder: the
	    # fallback branch below writes with PIL, which will not create it.
	    real_dir = os.path.join(dest_dir, 'real')
	    os.makedirs(real_dir, exist_ok=True)

	    for filename in tqdm(os.listdir(src_dir), desc="Extracting REAL Faces"):
	        if not filename.lower().endswith(('.png', '.jpg', '.jpeg')):
	            continue

	        src_path = os.path.join(src_dir, filename)
	        dest_path = os.path.join(real_dir, f"real_{filename}")

	        try:
	            # Close the source file promptly; convert() returns a loaded copy.
	            with Image.open(src_path) as raw:
	                img = raw.convert('RGB')

	            # Single detection pass: with save_path set, MTCNN writes the crop
	            # itself and returns None (writing nothing) when no face is found.
	            # Probing first with save_path=None ran the detector twice.
	            # NOTE(review): crops use the detector's configured image_size
	            # (library default), which may differ from the 224x224 fallback.
	            if mtcnn(img, save_path=dest_path) is None:
	                # Fallback when no face was detected: keep the full frame.
	                img.resize((224, 224)).save(dest_path)
	        except Exception as e:
	            # Deliberately broad: one unreadable image must not abort the run.
	            print(f"Error extracting face from {filename}: {e}")

	def process_deepfake(src_dir, dest_dir):
	    """Crop faces from a mixed real/fake folder and sort them by label.

	    The flickr_deepfake images are full resolution with real and fake files
	    mixed together. Only files whose names end in .png, .jpg or .jpeg
	    (case-insensitive) are processed. The label is derived from the filename:
	    a name containing an underscore (ID_ID.jpg style, implying a face swap)
	    is treated as fake, anything else as real. Every output is named
	    ``df_<original filename>`` and written to ``dest_dir/<label>``. When the
	    detector finds no face, the whole image is resized to 224x224 and saved
	    instead.

	    Args:
	        src_dir: Directory holding the mixed source images.
	        dest_dir: Output root; its ``real`` and ``fake`` sub-folders are
	            created if they do not exist yet.

	    Returns:
	        None. If ``src_dir`` does not exist a warning is printed and nothing
	        is written. Errors on individual files are printed and the file is
	        skipped (best-effort batch processing).
	    """
	    print(f"Processing Deepfake (MTCNN Face Extraction) from {src_dir} to {dest_dir}...")
	    if not os.path.exists(src_dir):
	        print(f"Warning: {src_dir} not found. Skipping.")
	        return

	    # Do not rely on module-level setup having created these folders: the
	    # fallback branch below writes with PIL, which will not create them.
	    for label in ('real', 'fake'):
	        os.makedirs(os.path.join(dest_dir, label), exist_ok=True)

	    for filename in tqdm(os.listdir(src_dir), desc="Extracting Faces"):
	        if not filename.lower().endswith(('.png', '.jpg', '.jpeg')):
	            continue

	        src_path = os.path.join(src_dir, filename)

	        # Heuristic: If ID_ID.jpg format (has underscore), it is fake. Else real.
	        dest_label = 'fake' if '_' in filename else 'real'
	        dest_path = os.path.join(dest_dir, dest_label, f"df_{filename}")

	        try:
	            # Close the source file promptly; convert() returns a loaded copy.
	            with Image.open(src_path) as raw:
	                img = raw.convert('RGB')

	            # Single detection pass: with save_path set, MTCNN writes the crop
	            # itself and returns None (writing nothing) when no face is found.
	            # Probing first with save_path=None ran the detector twice.
	            # NOTE(review): crops use the detector's configured image_size
	            # (library default), which may differ from the 224x224 fallback.
	            if mtcnn(img, save_path=dest_path) is None:
	                # Fallback if MTCNN fails finding a face: keep the full frame.
	                img.resize((224, 224)).save(dest_path)
	        except Exception as e:
	            # Deliberately broad: one unreadable image must not abort the run.
	            print(f"Error extracting face from {filename}: {e}")

	if __name__ == "__main__":
	    print("=== EXTRACTING REAL FFHQ FACES ===")

	    # Only the REAL (FFHQ) pass runs here; its crops land in the shared
	    # processed training folder.
	    ffhq_real_dir = "dataset/ffhq_real"
	    process_real_faces(ffhq_real_dir, PROCESSED_TRAIN_DIR)

	    # The deepfake pass stays disabled on purpose: its output was already
	    # produced, and the original note puts a re-run at about 2.5 hours.
	    # process_deepfake(DEEPFAKE_DIR, PROCESSED_TRAIN_DIR)

	    completion_message = (
	        "Real FFHQ Extraction Complete! "
	        "All unified images are stored in dataset/processed_train."
	    )
	    print(completion_message)