import os

import cv2
import numpy as np
import torch
import albumentations as A
from PIL import Image
from tqdm import tqdm
from facenet_pytorch import InceptionResnetV1, MTCNN
from transformers import CLIPProcessor, CLIPModel

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"[INFO] Using device: {device}")

# Initialize face detector (MTCNN) and face embedder (FaceNet)
mtcnn = MTCNN(image_size=160, device=device)
facenet = InceptionResnetV1(pretrained="vggface2").eval().to(device)

# Load CLIP model and processor
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Input data folders
DATA_DIR = "data"
CATEGORIES = ["real", "deepfake", "ai_gen"]

# Output path (also used when saving, so the two stay consistent)
FEATURES_DIR = "features"
os.makedirs(FEATURES_DIR, exist_ok=True)

# Data augmentation pipeline
augment = A.Compose([
    A.RandomBrightnessContrast(p=0.2),
    A.HorizontalFlip(p=0.5),
    A.Rotate(limit=10, p=0.3),
    A.MotionBlur(p=0.2),
    A.Resize(160, 160),  # MTCNN expects 160x160 input
])


def load_and_augment(img_path):
    """Load an image, resize it, and apply one random augmentation.

    The augmentation is drawn once per image so that FaceNet and CLIP
    both see the exact same augmented view.
    """
    image = Image.open(img_path).convert("RGB")
    img_np = cv2.resize(np.array(image), (160, 160))
    augmented = augment(image=img_np)["image"]
    return Image.fromarray(augmented)


def extract_facenet_features(img_aug, img_path):
    # Face detection using MTCNN
    face = mtcnn(img_aug)
    if face is None:
        print(f"[WARN] No face detected in {img_path}")
        return None
    face = face.unsqueeze(0).to(device)

    # Feature extraction using FaceNet
    with torch.no_grad():
        face_emb = facenet(face)
    return face_emb.squeeze().cpu().numpy()


def extract_clip_features(img_aug):
    # Global image features from CLIP
    inputs = clip_processor(images=img_aug, return_tensors="pt").to(device)
    with torch.no_grad():
        clip_outputs = clip_model.get_image_features(**inputs)
    return clip_outputs.squeeze().cpu().numpy()


def extract_combined_features(img_path):
    # Augment once, then feed the same augmented image to both extractors
    img_aug = load_and_augment(img_path)

    facenet_features = extract_facenet_features(img_aug, img_path)
    if facenet_features is None:
        return None
    clip_features = extract_clip_features(img_aug)

    # Concatenate the FaceNet and CLIP embeddings into one feature vector
    return np.concatenate((facenet_features, clip_features))


def extract_all_features():
    X, y = [], []

    for label, category in enumerate(CATEGORIES):
        folder = os.path.join(DATA_DIR, category)
        if not os.path.isdir(folder):
            print(f"[WARN] Missing folder: {folder}")
            continue

        print(f"\n🧠 Extracting from: {category} ({folder})")
        for fname in tqdm(os.listdir(folder)):
            if not fname.lower().endswith((".jpg", ".jpeg", ".png")):
                continue
            path = os.path.join(folder, fname)
            combined_features = extract_combined_features(path)
            if combined_features is not None:
                X.append(combined_features)
                y.append(label)

    # Save the extracted features into the directory created above
    np.save(os.path.join(FEATURES_DIR, "embeddings.npy"), np.array(X))
    np.save(os.path.join(FEATURES_DIR, "labels.npy"), np.array(y))
    print(f"\n✅ Done: Saved {len(X)} embeddings.")


if __name__ == "__main__":
    extract_all_features()
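

# --- Usage sketch (illustrative, not part of the extraction pipeline) ---
# Once embeddings.npy and labels.npy are written, they can be loaded and fed
# to any off-the-shelf classifier. Below is a minimal sketch using
# scikit-learn (assumed to be installed separately; the paths match
# FEATURES_DIR above):
#
#   import numpy as np
#   from sklearn.linear_model import LogisticRegression
#   from sklearn.model_selection import train_test_split
#
#   X = np.load("features/embeddings.npy")
#   y = np.load("features/labels.npy")
#   X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=42)
#   clf = LogisticRegression(max_iter=1000).fit(X_tr, y_tr)
#   print(f"Held-out accuracy: {clf.score(X_te, y_te):.3f}")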