"""Debug a ViT + EfficientNet deepfake ensemble on a single image.

Loads a Hugging Face image classifier and a local EfficientNet-B3 checkpoint,
runs both on the face found by MTCNN (or on the raw image when no face is
detected), and prints each model's probabilities next to two weighted blends.
"""
import sys

import timm
import torch
from facenet_pytorch import MTCNN
from PIL import Image
from torchvision import transforms
from transformers import AutoImageProcessor, AutoModelForImageClassification

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Evaluating on:", device)

mtcnn = MTCNN(keep_all=False, device=device)

# --- Load ViT ---
model_id = "prithivMLmods/Deep-Fake-Detector-v2-Model"
processor = AutoImageProcessor.from_pretrained(model_id)
vit_model = AutoModelForImageClassification.from_pretrained(model_id).to(device)
vit_model.eval()

# --- Load Local EfficientNet ---
eff_model = timm.create_model('efficientnet_b3', pretrained=False, num_classes=2)
# NOTE(review): torch.load unpickles the checkpoint; only point this at a
# trusted "model_best.pth".
eff_model.load_state_dict(torch.load("model_best.pth", map_location=device))
eff_model.to(device)
eff_model.eval()

eff_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Fraction of the face box width/height added on every side of the ViT crop.
_VIT_CROP_MARGIN = 0.15

# Substrings that mark a ViT label as "fake" or "real". "fake" also matches
# labels such as "Deepfake", so no separate "deepfake" entry is needed.
_FAKE_KEYWORDS = ("fake", "spoof")
_REAL_KEYWORDS = ("real", "pristine")


def _crop_faces(image):
    """Return ``(vit_image, eff_image)`` crops of the first detected face.

    Both entries are the unmodified ``image`` (and a warning is printed) when
    MTCNN reports no face.
    """
    boxes, _ = mtcnn.detect(image)
    if boxes is None or len(boxes) == 0:
        print(">> WARNING: MTCNN found no face! Both models falling back to raw image.")
        return image, image

    left, top, right, bottom = boxes[0][:4]

    # 1. Zero-margin crop. Per the original author's note, this exactly
    #    matches how extract_faces.py generated the EfficientNet training
    #    dataset, so the box is deliberately not padded or clamped here.
    eff_image = image.crop((int(left), int(top), int(right), int(bottom)))

    # 2. Padded crop for the ViT, clamped to the image bounds.
    pad_w = (right - left) * _VIT_CROP_MARGIN
    pad_h = (bottom - top) * _VIT_CROP_MARGIN
    vit_box = (
        int(max(0, left - pad_w)),
        int(max(0, top - pad_h)),
        int(min(image.width, right + pad_w)),
        int(min(image.height, bottom + pad_h)),
    )
    return image.crop(vit_box), eff_image


def _vit_scores(vit_image):
    """Run the ViT, print every class probability, return ``(fake, real)``.

    Class probabilities are summed into the fake or real bucket according to
    keyword matches on the lower-cased label name; labels matching neither
    keyword set contribute to neither score.
    """
    inputs = processor(images=vit_image, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = vit_model(**inputs)
    class_probs = torch.nn.functional.softmax(outputs.logits, dim=1)[0]

    fake_score = 0.0
    real_score = 0.0
    for idx, label_name in vit_model.config.id2label.items():
        prob = class_probs[idx].item()
        print(f">> IDX [{idx}] ({label_name}): {prob * 100:.2f}%")

        label = label_name.lower()
        # Fake keywords take precedence, mirroring the original if/elif order.
        if any(keyword in label for keyword in _FAKE_KEYWORDS):
            fake_score += prob
        elif any(keyword in label for keyword in _REAL_KEYWORDS):
            real_score += prob
    return fake_score, real_score


def _eff_scores(eff_image):
    """Run the local EfficientNet and return ``(fake, real)`` probabilities."""
    batch = eff_transform(eff_image).unsqueeze(0).to(device)
    with torch.no_grad():
        logits = eff_model(batch)
    class_probs = torch.nn.functional.softmax(logits, dim=1)[0]
    # Index 0 is Fake and index 1 is Real, per the original author's comments.
    return class_probs[0].item(), class_probs[1].item()


def debug_ensemble(image_path: str) -> None:
    """Print a step-by-step diagnostic of the ensemble for one image.

    Args:
        image_path: Path of the image file to analyse.

    The report covers the per-class ViT probabilities, the EfficientNet
    fake/real probabilities, and the 70/30 and 50/50 ViT/EfficientNet blends.
    Nothing is returned; all results go to stdout.
    """
    print("\n======================================")
    print(f"DEBUGGING INFERENCE OVER: {image_path}")
    print("======================================")

    # convert() returns an independent image, so the file can be closed here.
    with Image.open(image_path) as source:
        image = source.convert("RGB")

    vit_image, eff_image = _crop_faces(image)

    print("\n--- 1. CLOUD ViT DIAGNOSTIC ---")
    vit_fake_prob, vit_real_prob = _vit_scores(vit_image)
    print(f">> ViT Final Fake Score: {vit_fake_prob * 100:.2f}%")
    print(f">> ViT Final Real Score: {vit_real_prob * 100:.2f}%")

    print("\n--- 2. LOCAL EFFICIENTNET DIAGNOSTIC ---")
    eff_fake_prob, eff_real_prob = _eff_scores(eff_image)
    print(f">> Local Model FAKE (IDX 0): {eff_fake_prob * 100:.2f}%")
    print(f">> Local Model REAL (IDX 1): {eff_real_prob * 100:.2f}%")

    print("\n--- 3. MATHEMATICAL ENSEMBLE ---")
    # Current app.py weighting
    final_fake_prob = (0.7 * vit_fake_prob) + (0.3 * eff_fake_prob)
    final_real_prob = (0.7 * vit_real_prob) + (0.3 * eff_real_prob)
    print(f"> Current Output [70% ViT / 30% Eff]: FAKE={final_fake_prob*100:.2f}%, REAL={final_real_prob*100:.2f}%")

    # Proposed balanced weighting
    bal_fake = (0.5 * vit_fake_prob) + (0.5 * eff_fake_prob)
    bal_real = (0.5 * vit_real_prob) + (0.5 * eff_real_prob)
    print(f"> Balanced Output [50% ViT / 50% Eff]: FAKE={bal_fake*100:.2f}%, REAL={bal_real*100:.2f}%")


if __name__ == "__main__":
    if len(sys.argv) > 1:
        debug_ensemble(sys.argv[1])
    else:
        print("Please provide an image path: python test_debug.py <image_path>")
        # Signal the usage error to the calling shell.
        sys.exit(1)