"""Deepfake video inference and Grad-CAM heatmap helpers.

At import time this module loads the classifier weights from
``best_model.pth``, builds a Grad-CAM helper on the last CNN block and
creates a Haar-cascade frontal-face detector.  It then exposes:

* ``extract_and_crop`` - sample frames from a video and crop a face.
* ``run_inference``    - classify a video as "Real" or "Fake".
* ``compute_regions``  - summarise a CAM into per-band relative weights.
* ``generate_heatmap`` - Grad-CAM overlay for one cached frame.
"""

import base64
from collections import OrderedDict

import torch
import cv2
import numpy as np
import torchvision.transforms as T

from model import DeepfakeEffNetTransformer
from cam import GradCAM, overlay_heatmap

# Number of frames sampled per video; also the length of the repeated
# single-frame sequence fed to Grad-CAM.
NUM_FRAMES = 32

# Side length (pixels) every face crop is resized to.
FRAME_SIZE = 240

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# LOAD MODEL
model = DeepfakeEffNetTransformer()

# NOTE(review): torch.load unpickles the checkpoint; only load files from a
# trusted source (consider weights_only=True if the installed torch has it).
state_dict = torch.load(
    "best_model.pth",
    map_location="cpu"
)

# Drop every "module." occurrence from the parameter names
# (presumably left by a DataParallel wrapper at training time - TODO confirm).
new_state = OrderedDict(
    (key.replace("module.", ""), value) for key, value in state_dict.items()
)

model.load_state_dict(new_state)
model = model.to(device)
model.eval()
print("Model loaded")

# GRADCAM TARGET LAYER
target_layer = model.cnn.blocks[-1]
grad_cam = GradCAM(model, target_layer)

# FACE DETECTOR
face_detector = cv2.CascadeClassifier(
    cv2.data.haarcascades + "haarcascade_frontalface_default.xml"
)

# FRAME CACHE
# RGB face crops from the most recent run_inference call; generate_heatmap
# indexes into this list.
LAST_FRAMES = []


# FRAME EXTRACTION
def extract_and_crop(video_path, num_frames=NUM_FRAMES):
    """Sample frames evenly across a video and crop a face from each.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
        num_frames: Number of evenly spaced frame positions to request.

    Returns:
        A list of 240x240 RGB ``numpy`` images.  Frames that fail to decode
        are skipped, so the list may be shorter than ``num_frames`` and is
        empty when the video cannot be opened or read.
    """
    frames = []
    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            return frames

        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        # Clamp so a reported frame count of 0 never yields a negative
        # seek position.
        last_index = max(total_frames - 1, 0)
        positions = np.linspace(0, last_index, num_frames).astype(int)

        for position in positions:
            cap.set(cv2.CAP_PROP_POS_FRAMES, int(position))
            ok, frame = cap.read()
            if not ok:
                continue

            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            faces = face_detector.detectMultiScale(
                gray,
                scaleFactor=1.3,
                minNeighbors=5
            )

            if len(faces) > 0:
                # Only the first detection is used.
                x, y, w, h = faces[0]
                face = frame[y:y + h, x:x + w]
            else:
                # No face found: fall back to the whole frame.
                face = frame

            face = cv2.resize(face, (FRAME_SIZE, FRAME_SIZE))
            face = cv2.cvtColor(face, cv2.COLOR_BGR2RGB)
            frames.append(face)
    finally:
        # Release the capture even if an OpenCV call above raises.
        cap.release()

    return frames


# TRANSFORM
transform = T.Compose([
    T.ToPILImage(),
    T.Resize((FRAME_SIZE, FRAME_SIZE)),
    T.ToTensor(),
    T.Normalize([0.5] * 3, [0.5] * 3)
])


# INFERENCE
def run_inference(video_path):
    """Classify a video and return the prediction with its sampled frames.

    The extracted frames replace ``LAST_FRAMES`` (even when empty) so that
    ``generate_heatmap`` can later address them by index.

    Args:
        video_path: Path of the video to analyse.

    Returns:
        A dict with keys ``"label"`` ("Real" when the arg-max class is 0,
        otherwise "Fake"), ``"confidence"`` (softmax probability of the
        predicted class times 100) and ``"frames"`` (base64-encoded JPEG
        strings, one per extracted frame).  When no frame could be read the
        label is "Video tidak terbaca", confidence is 0 and frames is empty.
    """
    global LAST_FRAMES

    frames = extract_and_crop(video_path)
    LAST_FRAMES = frames

    if len(frames) == 0:
        return {
            "label": "Video tidak terbaca",
            "confidence": 0,
            "frames": []
        }

    # Stack the per-frame tensors and add a leading batch dimension.
    imgs = torch.stack([transform(f) for f in frames]).unsqueeze(0).to(device)

    with torch.no_grad():
        outputs = model(imgs)

    probs = torch.softmax(outputs, dim=1)[0]
    pred = torch.argmax(probs).item()
    confidence = probs[pred].item() * 100
    label = "Real" if pred == 0 else "Fake"

    encoded_frames = []
    for f in frames:
        # Frames are stored as RGB; imencode expects BGR channel order.
        _, buffer = cv2.imencode(
            ".jpg",
            cv2.cvtColor(f, cv2.COLOR_RGB2BGR)
        )
        encoded_frames.append(
            base64.b64encode(buffer).decode("utf-8")
        )

    return {
        "label": label,
        "confidence": confidence,
        "frames": encoded_frames
    }


# REGION IMPORTANCE
# (name, first row, end row) bands used to slice the CAM along its first axis.
_REGION_BANDS = (
    ("Forehead", 0, 60),
    ("Eyes", 60, 110),
    ("Cheeks", 110, 170),
    ("Mouth", 170, 220),
    ("Chin", 220, 240),
)


def compute_regions(cam):
    """Return the relative mean activation of five horizontal CAM bands.

    Args:
        cam: Activation map sliced as ``cam[rows, :]``.
            NOTE(review): the fixed row ranges assume 240 rows - confirm
            against ``GradCAM.generate``.

    Returns:
        A list of ``{"name": str, "value": float}`` dicts in the order
        Forehead, Eyes, Cheeks, Mouth, Chin, where each value is the band
        mean divided by the sum of all band means (plus 1e-8 to avoid a
        division by zero).
    """
    regions = {
        name: cam[start:stop, :].mean() for name, start, stop in _REGION_BANDS
    }

    total = sum(regions.values()) + 1e-8

    return [
        {"name": name, "value": float(mean / total)}
        for name, mean in regions.items()
    ]


# HEATMAP GENERATION
def generate_heatmap(frame_index):
    """Build a Grad-CAM overlay for one frame of the last analysed video.

    The chosen frame is repeated ``NUM_FRAMES`` times to form the sequence
    passed to ``grad_cam.generate``.

    Args:
        frame_index: Zero-based position in ``LAST_FRAMES``.

    Returns:
        ``(heatmap, regions)`` where ``heatmap`` is the result of
        ``overlay_heatmap`` on the BGR frame and ``regions`` comes from
        ``compute_regions``; ``(None, None)`` when ``frame_index`` is
        outside ``0 <= frame_index < len(LAST_FRAMES)``.
    """
    global LAST_FRAMES

    # Reject negative indices too: they would otherwise wrap around and
    # silently pick a frame from the end of the cache.
    if frame_index < 0 or frame_index >= len(LAST_FRAMES):
        return None, None

    frame = LAST_FRAMES[frame_index]
    img = transform(frame)

    seq = torch.stack([img] * NUM_FRAMES)
    seq = seq.unsqueeze(0).to(device)

    cam = grad_cam.generate(seq)
    regions = compute_regions(cam)

    heatmap = overlay_heatmap(
        cv2.cvtColor(frame, cv2.COLOR_RGB2BGR),
        cam
    )

    return heatmap, regions