Spaces:
Sleeping
Sleeping
| import torch | |
| import cv2 | |
| import numpy as np | |
| import torchvision.transforms as T | |
| from collections import OrderedDict | |
| import base64 | |
| from model import DeepfakeEffNetTransformer | |
| from cam import GradCAM, overlay_heatmap | |
# Prefer the GPU when torch can see one; otherwise everything runs on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def _strip_wrapper_prefix(key, prefix="module."):
    """Return *key* with every leading ``module.`` prefix removed.

    Only the start of the key is touched, so a parameter that merely
    contains ``module.`` somewhere in the middle of its name is left intact.
    """
    while key.startswith(prefix):
        key = key[len(prefix):]
    return key


# LOAD MODEL
model = DeepfakeEffNetTransformer()

# The checkpoint is read onto the CPU first and moved to `device` afterwards.
# NOTE(review): torch.load unpickles the file, so "best_model.pth" must come
# from a trusted source; consider `weights_only=True` if the installed torch
# version supports it.
state_dict = torch.load("best_model.pth", map_location="cpu")

# Checkpoint keys may carry a leading "module." (presumably from saving a
# DataParallel-wrapped model; confirm against the training script). The bare
# model does not use that prefix, so it is stripped before loading.
new_state = OrderedDict(
    (_strip_wrapper_prefix(k), v) for k, v in state_dict.items()
)

model.load_state_dict(new_state)
model = model.to(device)
model.eval()
print("Model loaded")
# GRADCAM TARGET LAYER
# Grad-CAM is attached to the last entry of the CNN's `blocks` container.
target_layer = model.cnn.blocks[-1]
grad_cam = GradCAM(model, target_layer)

# FACE DETECTOR
# Frontal-face Haar cascade shipped with OpenCV's bundled data directory.
face_detector = cv2.CascadeClassifier(
    f"{cv2.data.haarcascades}haarcascade_frontalface_default.xml"
)

# FRAME CACHE
# Holds the frames of the most recently processed video so a heatmap can be
# generated later for one of them by index.
LAST_FRAMES = []
| # FRAME EXTRACTION | |
def extract_and_crop(video_path, num_frames=32):
    """Sample frames evenly across a video and crop each one to a face.

    Args:
        video_path: Path (or other source accepted by ``cv2.VideoCapture``)
            of the video to read.
        num_frames: Number of evenly spaced frame positions to sample.

    Returns:
        A list of 240x240 RGB images. When a face is detected the image is
        the crop of the largest detection; otherwise it is the whole frame.
        The list is empty if the video cannot be opened, and may be shorter
        than ``num_frames`` because positions that fail to decode are
        skipped.
    """
    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        if not cap.isOpened():
            return frames

        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        # Clamp so a reported frame count of 0 (or less) never produces a
        # negative sample position.
        last_index = max(total_frames - 1, 0)
        idx = np.linspace(0, last_index, num_frames).astype(int)

        for i in idx:
            cap.set(cv2.CAP_PROP_POS_FRAMES, int(i))
            ret, frame = cap.read()
            if not ret:
                continue

            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            faces = face_detector.detectMultiScale(
                gray,
                scaleFactor=1.3,
                minNeighbors=5,
            )

            # `faces` may be an array, so test its length rather than its
            # truthiness.
            if len(faces) > 0:
                # Use the largest box: detections come in no guaranteed
                # order and small boxes are the likelier false positives.
                x, y, w, h = max(faces, key=lambda box: box[2] * box[3])
                face = frame[y:y + h, x:x + w]
            else:
                face = frame

            face = cv2.resize(face, (240, 240))
            face = cv2.cvtColor(face, cv2.COLOR_BGR2RGB)
            frames.append(face)
    finally:
        # Always free the capture handle, even if processing a frame raises.
        cap.release()

    return frames
| # TRANSFORM | |
# Per-frame preprocessing: array -> PIL image -> 240x240 -> tensor, then
# normalise every channel with mean 0.5 and std 0.5.
transform = T.Compose(
    [
        T.ToPILImage(),
        T.Resize(size=(240, 240)),
        T.ToTensor(),
        T.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
    ]
)
| # INFERENCE | |
def run_inference(video_path):
    """Classify a video as real or fake and return its sampled frames.

    The frames extracted from ``video_path`` are stored in the module-level
    ``LAST_FRAMES`` cache (even when the list is empty) so that a heatmap can
    be requested for one of them afterwards.

    Returns:
        A dict with three keys:
        ``"label"`` - ``"Real"`` when class 0 wins, ``"Fake"`` otherwise, or
        ``"Video tidak terbaca"`` when no frame could be extracted;
        ``"confidence"`` - softmax probability of the winning class as a
        percentage (``0`` when no frame could be extracted);
        ``"frames"`` - the frames as base64-encoded JPEG strings.
    """
    global LAST_FRAMES

    frames = extract_and_crop(video_path)
    LAST_FRAMES = frames

    # Guard clause: nothing could be read from the video.
    if not frames:
        return {
            "label": "Video tidak terbaca",
            "confidence": 0,
            "frames": [],
        }

    # Stack the preprocessed frames and add a leading batch dimension.
    clip = torch.stack([transform(rgb) for rgb in frames])
    batch = clip.unsqueeze(0).to(device)

    with torch.no_grad():
        logits = model(batch)
        probs = torch.softmax(logits, dim=1)[0]

    pred = torch.argmax(probs).item()
    confidence = probs[pred].item() * 100
    label = "Fake" if pred != 0 else "Real"

    # Frames are held as RGB; convert back to BGR for OpenCV's JPEG encoder.
    encoded_frames = []
    for rgb in frames:
        bgr = cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)
        _, buffer = cv2.imencode(".jpg", bgr)
        encoded_frames.append(base64.b64encode(buffer).decode("utf-8"))

    return {
        "label": label,
        "confidence": confidence,
        "frames": encoded_frames,
    }
| # REGION IMPORTANCE | |
def compute_regions(cam):
    """Split an activation map into horizontal face bands and score each one.

    The band boundaries are defined on a 240-row reference grid and scaled to
    the actual number of rows in ``cam``, so maps of other heights are
    supported. For a 240-row map the bands are exactly rows 0-60, 60-110,
    110-170, 170-220 and 220-240.

    Args:
        cam: Array-like indexed as ``cam[rows, cols]`` that exposes
            ``.shape`` and whose slices support ``.mean()``.

    Returns:
        A list of ``{"name": str, "value": float}`` dicts in top-to-bottom
        order (Forehead, Eyes, Cheeks, Mouth, Chin). Each value is the band's
        mean activation divided by the sum of all band means. A band that
        covers no rows (very short maps) scores ``0.0``.
    """
    reference_rows = 240
    bands = (
        ("Forehead", 0, 60),
        ("Eyes", 60, 110),
        ("Cheeks", 110, 170),
        ("Mouth", 170, 220),
        ("Chin", 220, 240),
    )

    height = cam.shape[0]
    regions = {}
    for name, start, stop in bands:
        # Integer scaling keeps the boundaries exact for a 240-row map.
        lo = start * height // reference_rows
        hi = stop * height // reference_rows
        # The mean of an empty slice would be NaN; score such a band as zero.
        regions[name] = cam[lo:hi, :].mean() if hi > lo else 0.0

    # The small epsilon avoids division by zero for an all-zero map.
    total = sum(regions.values()) + 1e-8

    return [
        {"name": name, "value": float(score / total)}
        for name, score in regions.items()
    ]
| # HEATMAP GENERATION | |
def generate_heatmap(frame_index):
    """Build a Grad-CAM overlay and region scores for one cached frame.

    Args:
        frame_index: Zero-based position of the frame in ``LAST_FRAMES``
            (the frames of the most recently processed video).

    Returns:
        ``(heatmap, regions)`` where ``heatmap`` is the result of
        ``overlay_heatmap`` on the BGR version of the frame and ``regions``
        is the output of ``compute_regions``. Returns ``(None, None)`` when
        ``frame_index`` is outside the cache, including negative indices.
    """
    # Reject negative indices too: they would otherwise wrap around to the
    # end of the list or raise IndexError.
    if not 0 <= frame_index < len(LAST_FRAMES):
        return None, None

    frame = LAST_FRAMES[frame_index]
    img = transform(frame)

    # The single frame is repeated 32 times (the default number of frames
    # sampled per video) and given a batch dimension; presumably this is the
    # sequence length the model expects - TODO confirm.
    seq = torch.stack([img] * 32).unsqueeze(0).to(device)

    cam = grad_cam.generate(seq)
    regions = compute_regions(cam)

    # Cached frames are RGB; the overlay is given the BGR version.
    heatmap = overlay_heatmap(
        cv2.cvtColor(frame, cv2.COLOR_RGB2BGR),
        cam,
    )
    return heatmap, regions