RoxSpeed committed on
Commit
d1b88e0
·
verified ·
1 Parent(s): 7c3fff2

Upload app.py

Browse files

Grad CAM for video detection

Files changed (1) hide show
  1. app.py +501 -453
app.py CHANGED
@@ -1,454 +1,502 @@
1
- # -*- coding: utf-8 -*-
2
- # Author: Gaojian Wang@ZJUICSR; TongWu@ZJUICSR
3
- # --------------------------------------------------------
4
- # This source code is licensed under the Attribution-NonCommercial 4.0 International License.
5
- # You can find the license in the LICENSE file in the root directory of this source tree.
6
- # --------------------------------------------------------
7
-
8
- import sys
9
- import os
10
- os.system(f'pip install grad-cam')
11
- os.system(f'pip install dlib')
12
- import dlib
13
- import argparse
14
- import numpy as np
15
- from PIL import Image
16
- import cv2
17
- import torch
18
- from huggingface_hub import hf_hub_download
19
- import gradio as gr
20
-
21
- import models_vit
22
- from util.datasets import build_dataset
23
- from engine_finetune import test_two_class, test_multi_class
24
- import matplotlib.pyplot as plt
25
- from torchvision import transforms
26
- import traceback
27
- from pytorch_grad_cam import (
28
- GradCAM, ScoreCAM,
29
- XGradCAM, EigenCAM
30
- )
31
- from pytorch_grad_cam import GuidedBackpropReLUModel
32
- from pytorch_grad_cam.utils.image import show_cam_on_image, preprocess_image
33
-
34
-
35
- def reshape_transform(tensor, height=14, width=14):
36
- result = tensor[:, 1:, :].reshape(tensor.size(0), height, width, tensor.size(2))
37
- result = result.transpose(2, 3).transpose(1, 2)
38
- return result
39
-
40
-
41
- def get_args_parser():
42
- parser = argparse.ArgumentParser('FSFM3C fine-tuning&Testing for image classification', add_help=False)
43
- parser.add_argument('--batch_size', default=64, type=int, help='Batch size per GPU')
44
- parser.add_argument('--epochs', default=50, type=int)
45
- parser.add_argument('--accum_iter', default=1, type=int, help='Accumulate gradient iterations')
46
- parser.add_argument('--model', default='vit_large_patch16', type=str, metavar='MODEL',
47
- help='Name of model to train')
48
- parser.add_argument('--input_size', default=224, type=int, help='images input size')
49
- parser.add_argument('--normalize_from_IMN', action='store_true', help='cal mean and std from imagenet')
50
- parser.set_defaults(normalize_from_IMN=True)
51
- parser.add_argument('--apply_simple_augment', action='store_true', help='apply simple data augment')
52
- parser.add_argument('--drop_path', type=float, default=0.1, metavar='PCT', help='Drop path rate')
53
- parser.add_argument('--clip_grad', type=float, default=None, metavar='NORM', help='Clip gradient norm')
54
- parser.add_argument('--weight_decay', type=float, default=0.05, help='weight decay')
55
- parser.add_argument('--lr', type=float, default=None, metavar='LR', help='learning rate')
56
- parser.add_argument('--blr', type=float, default=1e-3, metavar='LR', help='base learning rate')
57
- parser.add_argument('--layer_decay', type=float, default=0.75, help='layer-wise lr decay')
58
- parser.add_argument('--min_lr', type=float, default=1e-6, metavar='LR', help='lower lr bound')
59
- parser.add_argument('--warmup_epochs', type=int, default=5, metavar='N', help='epochs to warmup LR')
60
- parser.add_argument('--color_jitter', type=float, default=None, metavar='PCT', help='Color jitter factor')
61
- parser.add_argument('--aa', type=str, default='rand-m9-mstd0.5-inc1', metavar='NAME', help='Use AutoAugment policy')
62
- parser.add_argument('--smoothing', type=float, default=0.1, help='Label smoothing')
63
- parser.add_argument('--reprob', type=float, default=0.25, metavar='PCT', help='Random erase prob')
64
- parser.add_argument('--remode', type=str, default='pixel', help='Random erase mode')
65
- parser.add_argument('--recount', type=int, default=1, help='Random erase count')
66
- parser.add_argument('--resplit', action='store_true', default=False,
67
- help='Do not random erase first augmentation split')
68
- parser.add_argument('--mixup', type=float, default=0, help='mixup alpha')
69
- parser.add_argument('--cutmix', type=float, default=0, help='cutmix alpha')
70
- parser.add_argument('--cutmix_minmax', type=float, nargs='+', default=None, help='cutmix min/max ratio')
71
- parser.add_argument('--mixup_prob', type=float, default=1.0, help='Probability of performing mixup or cutmix')
72
- parser.add_argument('--mixup_switch_prob', type=float, default=0.5, help='Probability of switching to cutmix')
73
- parser.add_argument('--mixup_mode', type=str, default='batch', help='How to apply mixup/cutmix params')
74
- parser.add_argument('--finetune', default='', help='finetune from checkpoint')
75
- parser.add_argument('--global_pool', action='store_true')
76
- parser.set_defaults(global_pool=True)
77
- parser.add_argument('--cls_token', action='store_false', dest='global_pool',
78
- help='Use class token for classification')
79
- parser.add_argument('--data_path', default='/datasets01/imagenet_full_size/061417/', type=str, help='dataset path')
80
- parser.add_argument('--nb_classes', default=1000, type=int, help='number of the classification types')
81
- parser.add_argument('--output_dir', default='', help='path where to save')
82
- parser.add_argument('--log_dir', default='', help='path where to tensorboard log')
83
- parser.add_argument('--device', default='cuda', help='device to use for training / testing')
84
- parser.add_argument('--seed', default=0, type=int)
85
- parser.add_argument('--resume', default='', help='resume from checkpoint')
86
- parser.add_argument('--start_epoch', default=0, type=int, metavar='N', help='start epoch')
87
- parser.add_argument('--eval', action='store_true', help='Perform evaluation only')
88
- parser.set_defaults(eval=True)
89
- parser.add_argument('--dist_eval', action='store_true', default=False, help='Enabling distributed evaluation')
90
- parser.add_argument('--num_workers', default=10, type=int)
91
- parser.add_argument('--pin_mem', action='store_true', help='Pin CPU memory in DataLoader')
92
- parser.add_argument('--no_pin_mem', action='store_false', dest='pin_mem')
93
- parser.set_defaults(pin_mem=True)
94
- parser.add_argument('--world_size', default=1, type=int, help='number of distributed processes')
95
- parser.add_argument('--local_rank', default=-1, type=int)
96
- parser.add_argument('--dist_on_itp', action='store_true')
97
- parser.add_argument('--dist_url', default='env://', help='url used to set up distributed training')
98
- return parser
99
-
100
-
101
- def load_model(select_skpt):
102
- global ckpt, device, model, checkpoint
103
- if select_skpt not in CKPT_NAME:
104
- return gr.update(), "Select a correct model"
105
- ckpt = select_skpt
106
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
107
- args.nb_classes = CKPT_CLASS[ckpt]
108
- model = models_vit.__dict__[CKPT_MODEL[ckpt]](
109
- num_classes=args.nb_classes,
110
- drop_path_rate=args.drop_path,
111
- global_pool=args.global_pool,
112
- ).to(device)
113
-
114
- args.resume = os.path.join(CKPT_SAVE_PATH, CKPT_PATH[ckpt])
115
- if os.path.isfile(args.resume) == False:
116
- hf_hub_download(local_dir=CKPT_SAVE_PATH,
117
- local_dir_use_symlinks=False,
118
- repo_id='Wolowolo/fsfm-3c',
119
- filename=CKPT_PATH[ckpt])
120
- args.resume = os.path.join(CKPT_SAVE_PATH, CKPT_PATH[ckpt])
121
- checkpoint = torch.load(args.resume, map_location=device)
122
- model.load_state_dict(checkpoint['model'], strict=False)
123
- model.eval()
124
- global cam
125
- cam = GradCAM(model=model,
126
- target_layers=[model.blocks[-1].norm1],
127
- reshape_transform=reshape_transform
128
- )
129
- return gr.update(), f"[Loaded Model Successfully:] {args.resume}] "
130
-
131
-
132
- def get_boundingbox(face, width, height, minsize=None):
133
- x1, y1, x2, y2 = face.left(), face.top(), face.right(), face.bottom()
134
- size_bb = int(max(x2 - x1, y2 - y1) * 1.3)
135
- if minsize and size_bb < minsize:
136
- size_bb = minsize
137
- center_x, center_y = (x1 + x2) // 2, (y1 + y2) // 2
138
- x1, y1 = max(int(center_x - size_bb // 2), 0), max(int(center_y - size_bb // 2), 0)
139
- size_bb = min(width - x1, size_bb)
140
- size_bb = min(height - y1, size_bb)
141
- return x1, y1, size_bb
142
-
143
-
144
- def extract_face(frame):
145
- face_detector = dlib.get_frontal_face_detector()
146
- image = np.array(frame.convert('RGB'))
147
- faces = face_detector(image, 1)
148
- if faces:
149
- face = faces[0]
150
- x, y, size = get_boundingbox(face, image.shape[1], image.shape[0])
151
- cropped_face = image[y:y + size, x:x + size]
152
- return Image.fromarray(cropped_face)
153
- return None
154
-
155
-
156
- def get_frame_index_uniform_sample(total_frame_num, extract_frame_num):
157
- return np.linspace(0, total_frame_num - 1, num=extract_frame_num, dtype=int).tolist()
158
-
159
-
160
- def extract_face_from_fixed_num_frames(src_video, dst_path, num_frames=None):
161
- video_capture = cv2.VideoCapture(src_video)
162
- total_frames = int(video_capture.get(cv2.CAP_PROP_FRAME_COUNT))
163
- frame_indices = get_frame_index_uniform_sample(total_frames, num_frames) if num_frames else range(total_frames)
164
- for frame_index in frame_indices:
165
- video_capture.set(cv2.CAP_PROP_POS_FRAMES, frame_index)
166
- ret, frame = video_capture.read()
167
- if not ret:
168
- continue
169
- image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
170
- img = extract_face(image)
171
- if img:
172
- img = img.resize((224, 224), Image.BICUBIC)
173
- save_img_name = f"frame_{frame_index}.png"
174
- img.save(os.path.join(dst_path, '0', save_img_name))
175
- video_capture.release()
176
- return frame_indices
177
-
178
-
179
- class TargetCategory:
180
- def __init__(self, category_index):
181
- self.category_index = category_index
182
-
183
- def __call__(self, output):
184
- return output[self.category_index]
185
-
186
-
187
def preprocess_image_cam(pil_img,
                         mean=[0.5482207536697388, 0.42340534925460815, 0.3654651641845703],
                         std=[0.2789176106452942, 0.2438540756702423, 0.23493893444538116]):
    """Normalize an RGB image into a (1, 3, H, W) float array.

    Pixels are scaled to [0, 1], normalised channel-wise with the given
    mean/std (presumably statistics of the training data — TODO confirm),
    and reordered from HWC to CHW with a leading batch axis.
    """
    scaled = np.asarray(pil_img, dtype=np.float32) / 255.0
    # Broadcasting against the Python lists promotes the result to
    # float64, matching the original implementation.
    normalized = (scaled - mean) / std
    chw = normalized.transpose(2, 0, 1)
    return chw[np.newaxis, ...]
196
-
197
-
198
def FSFM3C_image_detection(image):
    """Gradio callback: classify the largest face in an uploaded image.

    Crops the face, saves it into a fresh numbered folder so the
    ImageFolder-style dataset can read it, then runs the currently loaded
    checkpoint.  For the 4-class unified detector, also renders a
    Grad-CAM heatmap for the predicted class and returns
    (result text, heatmap path, top-class probability string).
    For 2-class checkpoints, returns (result text, None, None).
    NOTE(review): the no-face path returns a single string while the UI
    wires three outputs — verify Gradio tolerates this.
    """
    # New numbered working dir per request; '0' is the dummy class label.
    frame_path = os.path.join(FRAME_SAVE_PATH, str(len(os.listdir(FRAME_SAVE_PATH))))
    os.makedirs(frame_path, exist_ok=True)
    os.makedirs(os.path.join(frame_path, '0'), exist_ok=True)
    img = extract_face(image)
    if img is None:
        return 'No face detected, please upload a clear face!'
    img = img.resize((224, 224), Image.BICUBIC)
    img.save(os.path.join(frame_path, '0', "frame_0.png"))
    args.data_path = frame_path
    args.batch_size = 1
    dataset_val = build_dataset(is_train=False, args=args)
    sampler_val = torch.utils.data.SequentialSampler(dataset_val)
    data_loader_val = torch.utils.data.DataLoader(dataset_val, sampler=sampler_val, batch_size=args.batch_size,
                                                  num_workers=args.num_workers, pin_memory=args.pin_mem,
                                                  drop_last=False)

    if CKPT_CLASS[ckpt] > 2:
        # 4-class unified detector path.
        frame_preds_list, video_pred_list = test_multi_class(data_loader_val, model, device)
        class_names = ['Real or Bonafide', 'Deepfake', 'Diffusion or AIGC generated', 'Spoofing or Presentation-attack']
        avg_video_pred = np.mean(video_pred_list, axis=0)
        max_prob_index = np.argmax(avg_video_pred)
        max_prob_class = class_names[max_prob_index]
        probabilities = [f"{class_names[i]}: {prob * 100:.1f}%" for i, prob in enumerate(avg_video_pred)]
        image_results = f"The largest face in this image may be {max_prob_class} with probability: \n [{', '.join(probabilities)}]"

        # Generate CAM heatmap for the detected class
        use_cuda = torch.cuda.is_available()
        input_tensor = preprocess_image(img,
                                        mean=[0.5482207536697388, 0.42340534925460815, 0.3654651641845703],
                                        std=[0.2789176106452942, 0.2438540756702423, 0.23493893444538116])
        if use_cuda:
            input_tensor = input_tensor.cuda()

        # Dynamically determine the target category based on the maximum probability class
        category_names_to_index = {
            'Real or Bonafide': 0,
            'Deepfake': 1,
            'Diffusion or AIGC generated': 2,
            'Spoofing or Presentation-attack': 3
        }
        target_category = TargetCategory(category_names_to_index[max_prob_class])

        # NOTE(review): this local `cam` shadows the global one created in
        # load_model(); consider reusing the global instance.
        cam = GradCAM(model=model,
                      target_layers=[model.blocks[-1].norm1],
                      reshape_transform=reshape_transform
                      )
        grayscale_cam = cam(input_tensor=input_tensor, targets=[target_category], aug_smooth=False, eigen_smooth=True)
        # Inverts the CAM before overlay — presumably so low-attribution
        # regions are highlighted; TODO confirm intended polarity.
        grayscale_cam = 1 - grayscale_cam[0, :]
        img = np.array(img)
        # Drop an alpha channel if present before overlaying.
        if img.shape[2] == 4:
            img = img[:, :, :3]
        img = img.astype(np.float32) / 255.0
        visualization = show_cam_on_image(img, grayscale_cam)
        visualization = cv2.cvtColor(visualization, cv2.COLOR_RGB2BGR)

        # Add text overlay to the heatmap
        # text = f"Detected: {max_prob_class}"
        # cv2.putText(visualization, text, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
        cam_path = os.path.join(CAM_SAVE_PATH, str(len(os.listdir(CAM_SAVE_PATH))))
        os.makedirs(cam_path, exist_ok=True)
        os.makedirs(os.path.join(cam_path, '0'), exist_ok=True)
        output_path = os.path.join(cam_path, "output_heatmap.png")
        cv2.imwrite(output_path, visualization)
        return image_results, output_path, probabilities[max_prob_index]

    if CKPT_CLASS[ckpt] == 2:
        # Binary checkpoints: prob <= 0.5 means the "attack" class.
        frame_preds_list, video_pred_list = test_two_class(data_loader_val, model, device)
        if ckpt == 'DfD-Checkpoint_Fine-tuned_on_FF++':
            prob = sum(video_pred_list) / len(video_pred_list)
            label = "Deepfake" if prob <= 0.5 else "Real"
            # Report the probability of the chosen label, not the raw score.
            prob = prob if label == "Real" else 1 - prob
        if ckpt == 'FAS-Checkpoint_Fine-tuned_on_MCIO':
            prob = sum(video_pred_list) / len(video_pred_list)
            label = "Spoofing" if prob <= 0.5 else "Bonafide"
            prob = prob if label == "Bonafide" else 1 - prob
        image_results = f"The largest face in this image may be {label} with probability {prob * 100:.1f}%"
        return image_results, None, None
276
-
277
-
278
def FSFM3C_video_detection(video, num_frames):
    """Gradio callback: classify a video by sampling and cropping faces.

    Uniformly samples `num_frames` frames, crops faces into a numbered
    working folder, and runs the currently loaded checkpoint.  Returns a
    single result string containing the video-level verdict plus
    per-frame probabilities.
    """
    try:
        # New numbered working dir per request; '0' is the dummy class label.
        frame_path = os.path.join(FRAME_SAVE_PATH, str(len(os.listdir(FRAME_SAVE_PATH))))
        os.makedirs(frame_path, exist_ok=True)
        os.makedirs(os.path.join(frame_path, '0'), exist_ok=True)
        frame_indices = extract_face_from_fixed_num_frames(video, frame_path, num_frames=num_frames)
        args.data_path = frame_path
        # One batch holds all sampled frames of the video.
        args.batch_size = num_frames
        dataset_val = build_dataset(is_train=False, args=args)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)
        data_loader_val = torch.utils.data.DataLoader(dataset_val, sampler=sampler_val, batch_size=args.batch_size,
                                                      num_workers=args.num_workers, pin_memory=args.pin_mem,
                                                      drop_last=False)

        if CKPT_CLASS[ckpt] > 2:
            # 4-class unified detector: average frame scores per class.
            frame_preds_list, video_pred_list = test_multi_class(data_loader_val, model, device)
            class_names = ['Real or Bonafide', 'Deepfake', 'Diffusion or AIGC generated',
                           'Spoofing or Presentation-attack']
            avg_video_pred = np.mean(video_pred_list, axis=0)
            max_prob_index = np.argmax(avg_video_pred)
            max_prob_class = class_names[max_prob_index]
            probabilities = [f"{class_names[i]}: {prob * 100:.1f}%" for i, prob in enumerate(avg_video_pred)]

            # NOTE(review): assumes one prediction per sampled index;
            # frames skipped during extraction would misalign this mapping.
            frame_results = {f"frame_{frame_indices[i]}": [f"{class_names[j]}: {prob * 100:.1f}%" for j, prob in
                                                           enumerate(frame_preds_list[i])] for i in
                             range(len(frame_indices))}
            video_results = (
                f"The largest face in this image may be {max_prob_class} with probability: \n [{', '.join(probabilities)}]\n \n"
                f"The frame-level detection results ['frame_index': 'probabilities']: \n{frame_results}")
            return video_results

        if CKPT_CLASS[ckpt] == 2:
            # Binary checkpoints: prob <= 0.5 means the "attack" class.
            frame_preds_list, video_pred_list = test_two_class(data_loader_val, model, device)
            if ckpt == 'DfD-Checkpoint_Fine-tuned_on_FF++':
                prob = sum(video_pred_list) / len(video_pred_list)
                label = "Deepfake" if prob <= 0.5 else "Real"
                prob = prob if label == "Real" else 1 - prob
                # Per-frame values are reported as real-face probability.
                frame_results = {f"frame_{frame_indices[i]}": f"{(frame_preds_list[i]) * 100:.1f}%" for i in
                                 range(len(frame_indices))} if label == "Real" else {
                    f"frame_{frame_indices[i]}": f"{(1 - frame_preds_list[i]) * 100:.1f}%" for i in
                    range(len(frame_indices))}

            if ckpt == 'FAS-Checkpoint_Fine-tuned_on_MCIO':
                prob = sum(video_pred_list) / len(video_pred_list)
                label = "Spoofing" if prob <= 0.5 else "Bonafide"
                prob = prob if label == "Bonafide" else 1 - prob
                frame_results = {f"frame_{frame_indices[i]}": f"{(frame_preds_list[i]) * 100:.1f}%" for i in
                                 range(len(frame_indices))} if label == "Bonafide" else {
                    f"frame_{frame_indices[i]}": f"{(1 - frame_preds_list[i]) * 100:.1f}%" for i in
                    range(len(frame_indices))}

            video_results = (f"The largest face in this image may be {label} with probability {prob * 100:.1f}%\n \n"
                             f"The frame-level detection results ['frame_index': 'real_face_probability']: \n{frame_results}")
            return video_results
    except Exception as e:
        # NOTE(review): broad catch hides the real error (e is unused);
        # consider logging traceback.format_exc() before returning.
        return f"Error occurred. Please provide a clear face video or reduce the number of frames."
334
-
335
-
336
# Paths and Constants
# Working directories live next to this file: extracted frames, CAM
# heatmaps, and downloaded checkpoints.
P = os.path.abspath(__file__)
FRAME_SAVE_PATH = os.path.join(os.path.dirname(P), 'frame')
CAM_SAVE_PATH = os.path.join(os.path.dirname(P), 'cam')
CKPT_SAVE_PATH = os.path.join(os.path.dirname(P), 'checkpoints')
os.makedirs(FRAME_SAVE_PATH, exist_ok=True)
os.makedirs(CAM_SAVE_PATH, exist_ok=True)
os.makedirs(CKPT_SAVE_PATH, exist_ok=True)
# Checkpoint display names shown in the dropdown; these exact strings are
# also the keys of the tables below and are compared against `ckpt` in
# the detection functions — do not rename casually.
CKPT_NAME = [
    '✨Unified-detector_v1_Fine-tuned_on_4_classes',
    'DfD-Checkpoint_Fine-tuned_on_FF++',
    'FAS-Checkpoint_Fine-tuned_on_MCIO',
]
# Relative paths inside the 'Wolowolo/fsfm-3c' Hub repo.
CKPT_PATH = {
    '✨Unified-detector_v1_Fine-tuned_on_4_classes': 'finetuned_models/Unified-detector/v1_Fine-tuned_on_4_classes/checkpoint-min_train_loss.pth',
    'DfD-Checkpoint_Fine-tuned_on_FF++': 'finetuned_models/FF++_c23_32frames/checkpoint-min_val_loss.pth',
    'FAS-Checkpoint_Fine-tuned_on_MCIO': 'finetuned_models/MCIO_protocol/Both_MCIO/checkpoint-min_val_loss.pth',
}
# Number of output classes per checkpoint (drives 2-class vs 4-class paths).
CKPT_CLASS = {
    '✨Unified-detector_v1_Fine-tuned_on_4_classes': 4,
    'DfD-Checkpoint_Fine-tuned_on_FF++': 2,
    'FAS-Checkpoint_Fine-tuned_on_MCIO': 2
}
# models_vit factory name used to instantiate each checkpoint's backbone.
CKPT_MODEL = {
    '✨Unified-detector_v1_Fine-tuned_on_4_classes': 'vit_base_patch16',
    'DfD-Checkpoint_Fine-tuned_on_FF++': 'vit_base_patch16',
    'FAS-Checkpoint_Fine-tuned_on_MCIO': 'vit_base_patch16',
}
364
-
365
# Gradio UI: model selector, image-detection column (with Grad-CAM
# outputs), video-detection column, and the event wiring.
with gr.Blocks(css=".custom-label { font-weight: bold !important; font-size: 16px !important; }") as demo:
    gr.HTML(
        "<h1 style='text-align: center;'>🦱 Real Facial Image&Video Detection <br> Against Face Forgery (Deepfake/Diffusion) and Spoofing (Presentation-attacks)</h1>")
    gr.Markdown(
        "<b>☉ Powered by the fine-tuned ViT models that is pre-trained from [FSFM-3C](https://fsfm-3c.github.io/)</b> <br> "
        "<b>☉ We do not and cannot access or store the data you have uploaded!</b> <br> "
        "<b>☉ Release (Continuously updating [by [Gaojian Wang/汪高健](https://scholar.google.com/citations?user=tpP4cFQAAAAJ&hl=zh-CN&oi=ao), [Tong Wu/吴桐](https://github.com/Coco-T-T), [Xingtang Luo/罗兴塘](https://github.com/Rox-C)]) </b> <br> <b>[V1.0] 2025/02/22-Current🎉</b>: "
        "1) Updated <b>[✨Unified-detector_v1] for Physical-Digital Face Attack&Forgery Detection, a ViT-B/16-224 (FSFM Pre-trained) detector that could identify Real&Bonafide, Deepfake, Diffusion&AIGC, Spooing&Presentation-attacks facial images or videos </b> ; 2) Provided the selection of the number of video frames (uniformly sampling 1-32 frames, more frames may time-consuming for this page without GPU acceleration); 3) Fixed some errors of V0.1. <br>"
        "<b>[V0.1] 2024/12-2025/02/21</b>: "
        "Create this page with basic detectors [DfD-Checkpoint_Fine-tuned_on_FF++, FAS-Checkpoint_Fine-tuned_on_MCIO] that follow the paper implementation. <br> ")
    gr.Markdown(
        "- Please <b>provide a facial image or video(<100s)</b>, and <b>select the model</b> for detection: <br> <b>[SUGGEST] [✨Unified-detector_v1_Fine-tuned_on_4_classes]</b> a (FSFM Pre-trained) ViT-B/16-224 for Both Real/Deepfake/Diffusion/Spoofing facial images&videos Detection <br> <b>[DfD-Checkpoint_Fine-tuned_on_FF++]</b> for deepfake detection, FSFM ViT-B/16-224 fine-tuned on the FF++_c23 train&val sets (4 manipulations, 32 frames per video) <br> <b>[FAS-Checkpoint_Fine-tuned_on_MCIO]</b> for face anti-spoofing, FSFM ViT-B/16-224 fine-tuned on the MCIO datasets (2 frames per video)")

    with gr.Row():
        # Checkpoint selector; changing it triggers load_model below.
        ckpt_select_dropdown = gr.Dropdown(
            label="Select the Model for Detection ⬇️",
            elem_classes="custom-label",
            choices=['Choose Model Here 🖱️'] + CKPT_NAME + ['continuously updating...'],
            multiselect=False,
            value='Choose Model Here 🖱️',
            interactive=True,
        )
        model_loading_status = gr.Textbox(label="Model Loading Status")
    with gr.Row():
        with gr.Column(scale=5):
            # --- image detection column ---
            gr.Markdown(
                "### Image Detection (Fast Try: copying image from [whichfaceisreal](https://whichfaceisreal.com/))")
            image = gr.Image(label="Upload/Capture/Paste your image", type="pil")
            image_submit_btn = gr.Button("Submit")
            output_results_image = gr.Textbox(label="Detection Result")

            with gr.Row():
                # Grad-CAM heatmap + top class (populated only by the
                # 4-class unified detector).
                output_heatmap = gr.Image(label="Grad_CAM")
                output_max_prob_class = gr.Textbox(label="Detected Class")
        with gr.Column(scale=5):
            # --- video detection column ---
            gr.Markdown("### Video Detection")
            video = gr.Video(label="Upload/Capture your video")
            frame_slider = gr.Slider(minimum=1, maximum=32, step=1, value=32, label="Number of Frames for Detection")
            video_submit_btn = gr.Button("Submit")
            output_results_video = gr.Textbox(label="Detection Result")

    # Visitor-map badge.
    gr.HTML(
        '<div style="display: flex; justify-content: center; gap: 20px; margin-bottom: 20px;">'
        '<a href="https://mapmyvisitors.com/web/1bxvi" title="Visit tracker">'
        '<img src="https://mapmyvisitors.com/map.png?d=FYhBoxLDEaFAxdfRzk5TuchYOBGrnSa98Ky59EkEEpY&cl=ffffff">'
        '</a>'
        '</div>'
    )

    # Event wiring: dropdown -> load_model, buttons -> detection callbacks.
    ckpt_select_dropdown.change(
        fn=load_model,
        inputs=[ckpt_select_dropdown],
        outputs=[ckpt_select_dropdown, model_loading_status],
    )
    image_submit_btn.click(
        fn=FSFM3C_image_detection,
        inputs=[image],
        outputs=[output_results_image, output_heatmap, output_max_prob_class],
    )
    video_submit_btn.click(
        fn=FSFM3C_video_detection,
        inputs=[video, frame_slider],
        outputs=[output_results_video],
    )
429
-
430
if __name__ == "__main__":
    # Parse args and eagerly load the default (4-class) checkpoint so the
    # app is usable before the user touches the dropdown.  Note this
    # duplicates most of load_model() but does not create the GradCAM
    # instance — the image callback builds its own.
    args = get_args_parser()
    args = args.parse_args()
    ckpt = '✨Unified-detector_v1_Fine-tuned_on_4_classes'
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args.nb_classes = CKPT_CLASS[ckpt]
    model = models_vit.__dict__[CKPT_MODEL[ckpt]](
        num_classes=args.nb_classes,
        drop_path_rate=args.drop_path,
        global_pool=args.global_pool,
    ).to(device)
    args.resume = os.path.join(CKPT_SAVE_PATH, CKPT_PATH[ckpt])
    # Download the checkpoint from the Hub on first run.
    if os.path.isfile(args.resume) == False:
        hf_hub_download(local_dir=CKPT_SAVE_PATH,
                        local_dir_use_symlinks=False,
                        repo_id='Wolowolo/fsfm-3c',
                        filename=CKPT_PATH[ckpt])
        args.resume = os.path.join(CKPT_SAVE_PATH, CKPT_PATH[ckpt])
    checkpoint = torch.load(args.resume, map_location=device)
    model.load_state_dict(checkpoint['model'], strict=False)
    model.eval()

    gr.close_all()
    demo.queue()
    demo.launch()
 
1
+ # -*- coding: utf-8 -*-
2
+ # Author: Gaojian Wang@ZJUICSR; TongWu@ZJUICSR
3
+ # --------------------------------------------------------
4
+ # This source code is licensed under the Attribution-NonCommercial 4.0 International License.
5
+ # You can find the license in the LICENSE file in the root directory of this source tree.
6
+ # --------------------------------------------------------
7
+
8
+ import sys
9
+ import os
10
+ os.system(f'pip install grad-cam')
11
+ os.system(f'pip install dlib')
12
+ import dlib
13
+ import argparse
14
+ import numpy as np
15
+ from PIL import Image
16
+ import cv2
17
+ import torch
18
+ from huggingface_hub import hf_hub_download
19
+ import gradio as gr
20
+
21
+ import models_vit
22
+ from util.datasets import build_dataset
23
+ from engine_finetune import test_two_class, test_multi_class
24
+ import matplotlib.pyplot as plt
25
+ from torchvision import transforms
26
+ import traceback
27
+ from pytorch_grad_cam import (
28
+ GradCAM, ScoreCAM,
29
+ XGradCAM, EigenCAM
30
+ )
31
+ from pytorch_grad_cam import GuidedBackpropReLUModel
32
+ from pytorch_grad_cam.utils.image import show_cam_on_image, preprocess_image
33
+ import traceback
34
+
35
def reshape_transform(tensor, height=14, width=14):
    """Convert ViT token activations into a CAM-friendly feature map.

    Drops the leading class token, then rearranges the remaining patch
    tokens from (batch, tokens, channels) into (batch, channels, height,
    width) so pytorch-grad-cam can treat them like CNN activations.
    """
    patch_tokens = tensor[:, 1:, :]
    spatial = patch_tokens.reshape(tensor.size(0), height, width, tensor.size(2))
    # (B, H, W, C) -> (B, C, H, W)
    return spatial.permute(0, 3, 1, 2)
39
+
40
+
41
def get_args_parser():
    """Build the argparse parser used by FSFM-3C fine-tuning/testing.

    Only a handful of options matter for this demo app (input_size,
    nb_classes, drop_path, global_pool, batch_size, num_workers, pin_mem,
    data_path); the remaining training/distributed flags are kept for
    compatibility with the original training scripts.
    """
    parser = argparse.ArgumentParser('FSFM3C fine-tuning&Testing for image classification', add_help=False)
    # --- core runtime options ---
    parser.add_argument('--batch_size', default=64, type=int, help='Batch size per GPU')
    parser.add_argument('--epochs', default=50, type=int)
    parser.add_argument('--accum_iter', default=1, type=int, help='Accumulate gradient iterations')
    parser.add_argument('--model', default='vit_large_patch16', type=str, metavar='MODEL',
                        help='Name of model to train')
    parser.add_argument('--input_size', default=224, type=int, help='images input size')
    parser.add_argument('--normalize_from_IMN', action='store_true', help='cal mean and std from imagenet')
    parser.set_defaults(normalize_from_IMN=True)
    parser.add_argument('--apply_simple_augment', action='store_true', help='apply simple data augment')
    # --- optimizer / schedule (unused by the demo, kept for training code) ---
    parser.add_argument('--drop_path', type=float, default=0.1, metavar='PCT', help='Drop path rate')
    parser.add_argument('--clip_grad', type=float, default=None, metavar='NORM', help='Clip gradient norm')
    parser.add_argument('--weight_decay', type=float, default=0.05, help='weight decay')
    parser.add_argument('--lr', type=float, default=None, metavar='LR', help='learning rate')
    parser.add_argument('--blr', type=float, default=1e-3, metavar='LR', help='base learning rate')
    parser.add_argument('--layer_decay', type=float, default=0.75, help='layer-wise lr decay')
    parser.add_argument('--min_lr', type=float, default=1e-6, metavar='LR', help='lower lr bound')
    parser.add_argument('--warmup_epochs', type=int, default=5, metavar='N', help='epochs to warmup LR')
    # --- augmentation / regularization (unused by the demo) ---
    parser.add_argument('--color_jitter', type=float, default=None, metavar='PCT', help='Color jitter factor')
    parser.add_argument('--aa', type=str, default='rand-m9-mstd0.5-inc1', metavar='NAME', help='Use AutoAugment policy')
    parser.add_argument('--smoothing', type=float, default=0.1, help='Label smoothing')
    parser.add_argument('--reprob', type=float, default=0.25, metavar='PCT', help='Random erase prob')
    parser.add_argument('--remode', type=str, default='pixel', help='Random erase mode')
    parser.add_argument('--recount', type=int, default=1, help='Random erase count')
    parser.add_argument('--resplit', action='store_true', default=False,
                        help='Do not random erase first augmentation split')
    parser.add_argument('--mixup', type=float, default=0, help='mixup alpha')
    parser.add_argument('--cutmix', type=float, default=0, help='cutmix alpha')
    parser.add_argument('--cutmix_minmax', type=float, nargs='+', default=None, help='cutmix min/max ratio')
    parser.add_argument('--mixup_prob', type=float, default=1.0, help='Probability of performing mixup or cutmix')
    parser.add_argument('--mixup_switch_prob', type=float, default=0.5, help='Probability of switching to cutmix')
    parser.add_argument('--mixup_mode', type=str, default='batch', help='How to apply mixup/cutmix params')
    # --- model head / pooling ---
    parser.add_argument('--finetune', default='', help='finetune from checkpoint')
    parser.add_argument('--global_pool', action='store_true')
    parser.set_defaults(global_pool=True)
    # --cls_token flips global_pool off (both flags share dest='global_pool').
    parser.add_argument('--cls_token', action='store_false', dest='global_pool',
                        help='Use class token for classification')
    # --- data / checkpoint paths (data_path is overwritten per request) ---
    parser.add_argument('--data_path', default='/datasets01/imagenet_full_size/061417/', type=str, help='dataset path')
    parser.add_argument('--nb_classes', default=1000, type=int, help='number of the classification types')
    parser.add_argument('--output_dir', default='', help='path where to save')
    parser.add_argument('--log_dir', default='', help='path where to tensorboard log')
    parser.add_argument('--device', default='cuda', help='device to use for training / testing')
    parser.add_argument('--seed', default=0, type=int)
    parser.add_argument('--resume', default='', help='resume from checkpoint')
    parser.add_argument('--start_epoch', default=0, type=int, metavar='N', help='start epoch')
    parser.add_argument('--eval', action='store_true', help='Perform evaluation only')
    parser.set_defaults(eval=True)
    # --- dataloader / distributed (demo runs single-process) ---
    parser.add_argument('--dist_eval', action='store_true', default=False, help='Enabling distributed evaluation')
    parser.add_argument('--num_workers', default=10, type=int)
    parser.add_argument('--pin_mem', action='store_true', help='Pin CPU memory in DataLoader')
    parser.add_argument('--no_pin_mem', action='store_false', dest='pin_mem')
    parser.set_defaults(pin_mem=True)
    parser.add_argument('--world_size', default=1, type=int, help='number of distributed processes')
    parser.add_argument('--local_rank', default=-1, type=int)
    parser.add_argument('--dist_on_itp', action='store_true')
    parser.add_argument('--dist_url', default='env://', help='url used to set up distributed training')
    return parser
99
+
100
+
101
def load_model(select_skpt):
    """Gradio callback: load the selected checkpoint into module globals.

    Rebuilds the ViT for the checkpoint's class count, downloads the
    weights from the Hugging Face Hub if not cached locally, and creates
    a GradCAM instance on the last transformer block.  Returns a
    (dropdown update, status message) pair for the UI.  Mutates globals:
    ckpt, device, model, checkpoint, cam, and args.nb_classes/resume.
    """
    global ckpt, device, model, checkpoint
    if select_skpt not in CKPT_NAME:
        return gr.update(), "Select a correct model"
    ckpt = select_skpt
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args.nb_classes = CKPT_CLASS[ckpt]
    model = models_vit.__dict__[CKPT_MODEL[ckpt]](
        num_classes=args.nb_classes,
        drop_path_rate=args.drop_path,
        global_pool=args.global_pool,
    ).to(device)

    args.resume = os.path.join(CKPT_SAVE_PATH, CKPT_PATH[ckpt])
    # Fetch weights from the Hub only when they are not already on disk.
    if os.path.isfile(args.resume) == False:
        hf_hub_download(local_dir=CKPT_SAVE_PATH,
                        local_dir_use_symlinks=False,
                        repo_id='Wolowolo/fsfm-3c',
                        filename=CKPT_PATH[ckpt])
        args.resume = os.path.join(CKPT_SAVE_PATH, CKPT_PATH[ckpt])
    checkpoint = torch.load(args.resume, map_location=device)
    # strict=False: presumably head/position-embedding keys can differ
    # between checkpoints — TODO confirm which keys are expected to miss.
    model.load_state_dict(checkpoint['model'], strict=False)
    model.eval()
    global cam
    # CAM hooked on the last block's first LayerNorm; reshape_transform
    # maps ViT tokens back to a 14x14 spatial grid.
    cam = GradCAM(model=model,
                  target_layers=[model.blocks[-1].norm1],
                  reshape_transform=reshape_transform
                  )
    return gr.update(), f"[Loaded Model Successfully:] {args.resume}] "
130
+
131
+
132
def get_boundingbox(face, width, height, minsize=None):
    """Return (x, y, side) of a square crop around a dlib face detection.

    The square is 1.3x the larger side of the detection rectangle,
    centred on it, clamped so the crop starts at non-negative coordinates
    and never extends past the right/bottom image edge.
    """
    left, top, right, bottom = face.left(), face.top(), face.right(), face.bottom()
    side = int(max(right - left, bottom - top) * 1.3)
    if minsize and side < minsize:
        side = minsize
    cx = (left + right) // 2
    cy = (top + bottom) // 2
    # Clamp the top-left corner into the image.
    x = max(int(cx - side // 2), 0)
    y = max(int(cy - side // 2), 0)
    # Shrink the square if it would spill over the right or bottom edge.
    side = min(width - x, side, height - y)
    return x, y, side
142
+
143
+
144
def extract_face(frame):
    """Detect and crop the largest face in a PIL image.

    Args:
        frame: PIL Image (any mode; converted to RGB).

    Returns:
        Cropped face as a PIL Image, or None when no face is found.
    """
    # The dlib detector is expensive to construct; build it once and cache
    # it on the function (this runs once per sampled video frame otherwise).
    detector = getattr(extract_face, "_detector", None)
    if detector is None:
        detector = dlib.get_frontal_face_detector()
        extract_face._detector = detector
    image = np.array(frame.convert('RGB'))
    faces = detector(image, 1)
    if not faces:
        return None
    # The UI reports results for "the largest face", so select by area
    # (dlib does not guarantee size ordering of its detections).
    face = max(faces, key=lambda rect: rect.width() * rect.height())
    x, y, size = get_boundingbox(face, image.shape[1], image.shape[0])
    cropped_face = image[y:y + size, x:x + size]
    return Image.fromarray(cropped_face)


def get_frame_index_uniform_sample(total_frame_num, extract_frame_num):
    """Uniformly sample frame indices over [0, total_frame_num - 1].

    Note: ``num=extract_frame_num + 1`` yields one more index than
    ``extract_frame_num``; downstream result formatting iterates
    ``len(indices) - 1`` entries to compensate.
    """
    last_index = total_frame_num - 1
    sample_count = extract_frame_num + 1
    return np.linspace(0, last_index, num=sample_count, dtype=int).tolist()


def extract_face_from_fixed_num_frames(src_video, dst_path, num_frames=None):
    """Sample frames from a video, crop the face in each, and save as PNGs.

    Each detected face is resized to 224x224 and written to
    ``dst_path/0/frame_<index>.png`` (the '0' subdirectory must exist).
    When ``num_frames`` is None every frame is processed.

    Returns:
        The list/range of sampled frame indices.
    """
    capture = cv2.VideoCapture(src_video)
    total = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))
    if num_frames:
        frame_indices = get_frame_index_uniform_sample(total, num_frames)
    else:
        frame_indices = range(total)
    for frame_index in frame_indices:
        # Seek directly to the sampled frame rather than decoding linearly.
        capture.set(cv2.CAP_PROP_POS_FRAMES, frame_index)
        ok, frame = capture.read()
        if not ok:
            continue
        pil_frame = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        face = extract_face(pil_frame)
        if face:
            face = face.resize((224, 224), Image.BICUBIC)
            face.save(os.path.join(dst_path, '0', f"frame_{frame_index}.png"))
    capture.release()
    return frame_indices


class TargetCategory:
    """Callable Grad-CAM target that selects a single class activation."""

    def __init__(self, category_index):
        # Index of the class whose score drives the CAM.
        self.category_index = category_index

    def __call__(self, output):
        """Return the model-output entry for the configured class."""
        return output[self.category_index]


def preprocess_image_cam(pil_img,
                         mean=(0.5482207536697388, 0.42340534925460815, 0.3654651641845703),
                         std=(0.2789176106452942, 0.2438540756702423, 0.23493893444538116)):
    """Normalize an RGB image into a (1, 3, H, W) float array for CAM input.

    Args:
        pil_img: PIL image (or any array-like accepted by np.array).
        mean: per-channel means (defaults match the training statistics).
        std: per-channel standard deviations.

    Returns:
        np.ndarray of shape (1, 3, H, W).
    """
    # Tuples instead of the original mutable list defaults (best practice;
    # identical broadcasting behavior).
    img_np = np.array(pil_img).astype(np.float32) / 255.0
    img_np = (img_np - np.asarray(mean)) / np.asarray(std)  # per-channel normalize
    img_np = np.transpose(img_np, (2, 0, 1))                # HWC -> CHW
    return np.expand_dims(img_np, axis=0)                   # add batch dimension


def FSFM3C_image_detection(image):
    """Classify the largest face in one image with the loaded checkpoint.

    The face crop is saved into a fresh numbered folder under
    FRAME_SAVE_PATH (class subfolder '0') so ``build_dataset`` can read it
    as a one-image dataset.

    Returns:
        (details_text, label_text) for the two Gradio output boxes.
    """
    frame_path = os.path.join(FRAME_SAVE_PATH, str(len(os.listdir(FRAME_SAVE_PATH))))
    # makedirs creates both the run folder and its '0' subfolder.
    os.makedirs(os.path.join(frame_path, '0'), exist_ok=True)
    img = extract_face(image)
    if img is None:
        # Return a pair so BOTH wired Gradio outputs receive a value
        # (the previous single-string return left one output unfilled).
        return 'No face detected, please upload a clear face!', ''
    img = img.resize((224, 224), Image.BICUBIC)
    img.save(os.path.join(frame_path, '0', "frame_0.png"))
    args.data_path = frame_path
    args.batch_size = 1
    dataset_val = build_dataset(is_train=False, args=args)
    sampler_val = torch.utils.data.SequentialSampler(dataset_val)
    data_loader_val = torch.utils.data.DataLoader(dataset_val, sampler=sampler_val, batch_size=args.batch_size,
                                                  num_workers=args.num_workers, pin_memory=args.pin_mem,
                                                  drop_last=False)

    if CKPT_CLASS[ckpt] > 2:
        # Unified 4-class detector: report all class probabilities.
        frame_preds_list, video_pred_list = test_multi_class(data_loader_val, model, device)
        class_names = ['Real or Bonafide', 'Deepfake', 'Diffusion or AIGC generated', 'Spoofing or Presentation-attack']
        avg_video_pred = np.mean(video_pred_list, axis=0)
        max_prob_index = np.argmax(avg_video_pred)
        max_prob_class = class_names[max_prob_index]
        probabilities = [f"{class_names[i]}: {prob * 100:.1f}%" for i, prob in enumerate(avg_video_pred)]
        image_results = f"The largest face in this image may be {max_prob_class} with probability: \n [{', '.join(probabilities)}]"
        return image_results, max_prob_class

    # Binary checkpoints: the positive class ("Real"/"Bonafide") wins at > 0.5.
    frame_preds_list, video_pred_list = test_two_class(data_loader_val, model, device)
    prob = sum(video_pred_list) / len(video_pred_list)
    if ckpt == 'DfD-Checkpoint_Fine-tuned_on_FF++':
        label = "Real" if prob > 0.5 else "Deepfake"
    elif ckpt == 'FAS-Checkpoint_Fine-tuned_on_MCIO':
        label = "Bonafide" if prob > 0.5 else "Spoofing"
    else:
        # Previously this path fell through with `label` unbound (NameError).
        return f"Unsupported 2-class checkpoint: {ckpt}", ''
    if label not in ("Real", "Bonafide"):
        prob = 1 - prob  # report the probability of the predicted class
    image_results = f"The largest face in this image may be {label} with probability {prob * 100:.1f}%"
    return image_results, label


def generate_Grad_CAM(image,max_prob_class,cam_path,frame_index=None):
    """Render a Grad-CAM heatmap for the predicted class and save it as PNG.

    Args:
        image: input PIL image (a full frame; the face is re-cropped here).
        max_prob_class: predicted class name; must be a key of the category
            table below.
        cam_path: directory where the heatmap image is written.
        frame_index: optional frame number used in the output filename.

    Returns:
        Path of the saved heatmap, or an error string when no face is found
        (NOTE(review): callers pass this value straight to the gallery/image
        component — confirm Gradio tolerates the error-string case).
    """
    # Generate CAM heatmap for the detected class
    img = extract_face(image)
    if img is None:
        return 'No face detected, please upload a clear face!'
    img = img.resize((224, 224), Image.BICUBIC)
    use_cuda = torch.cuda.is_available()
    # Normalization constants match the ones used elsewhere in this file.
    input_tensor = preprocess_image(img,
                                    mean=[0.5482207536697388, 0.42340534925460815, 0.3654651641845703],
                                    std=[0.2789176106452942, 0.2438540756702423, 0.23493893444538116])
    if use_cuda:
        input_tensor = input_tensor.cuda()

    # Dynamically determine the target category based on the maximum probability class
    category_names_to_index = {
        'Real or Bonafide': 0,
        'Deepfake': 1,
        'Diffusion or AIGC generated': 2,
        'Spoofing or Presentation-attack': 3
    }
    target_category = TargetCategory(category_names_to_index[max_prob_class])

    # A fresh GradCAM is built per call (this local `cam` shadows the
    # module-level one created by load_model); hooks attach to the last
    # transformer block's norm1 layer.
    cam = GradCAM(model=model,
                  target_layers=[model.blocks[-1].norm1],
                  reshape_transform=reshape_transform
                  )
    grayscale_cam = cam(input_tensor=input_tensor, targets=[target_category], aug_smooth=False, eigen_smooth=True)
    # NOTE(review): the map is inverted (1 - cam) — presumably deliberate for
    # this visualization; confirm against the intended highlighting.
    grayscale_cam = 1 - grayscale_cam[0, :]
    img = np.array(img)
    if img.shape[2] == 4:
        # Drop the alpha channel so show_cam_on_image gets 3 channels.
        img = img[:, :, :3]
    img = img.astype(np.float32) / 255.0
    visualization = show_cam_on_image(img, grayscale_cam)
    # show_cam_on_image returns RGB; cv2.imwrite expects BGR.
    visualization = cv2.cvtColor(visualization, cv2.COLOR_RGB2BGR)
    if frame_index is not None:
        output_path = os.path.join(cam_path, f"frame_{frame_index}_heatmap.png")
    else:
        output_path = os.path.join(cam_path, "output_heatmap.png")
    cv2.imwrite(output_path, visualization)
    return output_path


def image_Grad_CAM(image, max_prob_class):
    """Produce one Grad-CAM heatmap for an image in a fresh output folder."""
    # Number the run folder by how many runs already exist under CAM_SAVE_PATH.
    run_dir = os.path.join(CAM_SAVE_PATH, str(len(os.listdir(CAM_SAVE_PATH))))
    target_dir = os.path.join(run_dir, '0')
    os.makedirs(target_dir, exist_ok=True)
    return generate_Grad_CAM(image, max_prob_class, target_dir)


def video_Grad_CAM(max_prob_class):
    """Generate Grad-CAM heatmaps for every frame of the last analyzed video.

    Reads the face crops written by FSFM3C_video_detection from
    ``args.data_path/0``, processes them in ascending frame order, and
    returns the list of heatmap file paths for the gallery.
    """
    frame_dir = os.path.join(args.data_path, "0")

    def _frame_index(name):
        # "frame_<idx>.png" -> <idx>
        return int(name.split("_")[1].split(".")[0])

    frame_files = sorted(
        (f for f in os.listdir(frame_dir) if f.startswith("frame_") and f.endswith(".png")),
        key=_frame_index,
    )
    run_dir = os.path.join(CAM_SAVE_PATH, str(len(os.listdir(CAM_SAVE_PATH))))
    out_dir = os.path.join(run_dir, '0')
    os.makedirs(out_dir, exist_ok=True)
    heatmaps = []
    for name in frame_files:
        frame_img = Image.open(os.path.join(frame_dir, name))
        heatmaps.append(
            generate_Grad_CAM(frame_img, max_prob_class, out_dir, frame_index=_frame_index(name))
        )
    return heatmaps


def FSFM3C_video_detection(video, num_frames):
    """Classify a video by sampling frames, cropping faces, and averaging.

    Args:
        video: path to the uploaded video file.
        num_frames: number of frames to sample uniformly.

    Returns:
        (details_text, label_text) on success, or a single error string when
        anything raises (NOTE(review): the error path returns one value while
        the Gradio handler wires two outputs — verify the UI tolerates this).
    """
    try:
        # New numbered working folder; face crops go into its '0' subfolder.
        frame_path = os.path.join(FRAME_SAVE_PATH, str(len(os.listdir(FRAME_SAVE_PATH))))
        os.makedirs(frame_path, exist_ok=True)
        os.makedirs(os.path.join(frame_path, '0'), exist_ok=True)
        frame_indices = extract_face_from_fixed_num_frames(video, frame_path, num_frames=num_frames)
        args.data_path = frame_path
        args.batch_size = num_frames
        dataset_val = build_dataset(is_train=False, args=args)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)
        data_loader_val = torch.utils.data.DataLoader(dataset_val, sampler=sampler_val, batch_size=args.batch_size,
                                                      num_workers=args.num_workers, pin_memory=args.pin_mem,
                                                      drop_last=False)
        if CKPT_CLASS[ckpt] > 2:
            # Unified 4-class detector: average per-frame probabilities.
            frame_preds_list, video_pred_list = test_multi_class(data_loader_val, model, device)
            class_names = ['Real or Bonafide', 'Deepfake', 'Diffusion or AIGC generated',
                           'Spoofing or Presentation-attack']
            avg_video_pred = np.mean(video_pred_list, axis=0)
            max_prob_index = np.argmax(avg_video_pred)
            max_prob_class = class_names[max_prob_index]
            probabilities = [f"{class_names[i]}: {prob * 100:.1f}%" for i, prob in enumerate(avg_video_pred)]

            # Manually format frame_results so each frame's result sits on its
            # own line. `len(frame_indices) - 1` skips the extra trailing index
            # produced by get_frame_index_uniform_sample (which samples
            # num_frames + 1 points).
            frame_results = "\n".join([f"frame_{frame_indices[i]}: {', '.join([f'{class_names[j]}: {prob * 100:.1f}%' for j, prob in enumerate(frame_preds_list[i])])}" for i in range(len(frame_indices) - 1)])

            video_results = (
                f"The largest face in this image may be {max_prob_class} with probability: \n [{', '.join(probabilities)}]\n \n"
                f"The frame-level detection results:\n {frame_results}")
            return video_results, max_prob_class

        if CKPT_CLASS[ckpt] == 2:
            # Binary checkpoints: positive class ("Real"/"Bonafide") at > 0.5;
            # per-frame scores are flipped to the predicted class's probability.
            frame_preds_list, video_pred_list = test_two_class(data_loader_val, model, device)
            if ckpt == 'DfD-Checkpoint_Fine-tuned_on_FF++':
                prob = sum(video_pred_list) / len(video_pred_list)
                label = "Deepfake" if prob <= 0.5 else "Real"
                prob = prob if label == "Real" else 1 - prob
                frame_results = "\n".join([f"frame_{frame_indices[i]}: {(frame_preds_list[i] * 100):.1f}%" for i in range(len(frame_indices) - 1)]) if label == "Real" else "\n".join([f"frame_{frame_indices[i]}: {(1 - frame_preds_list[i]) * 100:.1f}%" for i in range(len(frame_indices) - 1)])

            if ckpt == 'FAS-Checkpoint_Fine-tuned_on_MCIO':
                prob = sum(video_pred_list) / len(video_pred_list)
                label = "Spoofing" if prob <= 0.5 else "Bonafide"
                prob = prob if label == "Bonafide" else 1 - prob
                frame_results = "\n".join([f"frame_{frame_indices[i]}: {(frame_preds_list[i] * 100):.1f}%" for i in range(len(frame_indices) - 1)]) if label == "Bonafide" else "\n".join([f"frame_{frame_indices[i]}: {(1 - frame_preds_list[i]) * 100:.1f}%" for i in range(len(frame_indices) - 1)])

            video_results = (f"The largest face in this image may be {label} with probability {prob * 100:.1f}%\n \n"
                             f"The frame-level detection results:\n{frame_results}")
            return video_results, label
    except Exception as e:
        traceback.print_exc()
        return f"Error occurred. Please provide a clear face video or reduce the number of frames."


# Paths and Constants
# Working directories live next to this script: extracted face frames,
# Grad-CAM renderings, and downloaded model checkpoints.
P = os.path.abspath(__file__)
FRAME_SAVE_PATH = os.path.join(os.path.dirname(P), 'frame')
CAM_SAVE_PATH = os.path.join(os.path.dirname(P), 'cam')
CKPT_SAVE_PATH = os.path.join(os.path.dirname(P), 'checkpoints')
os.makedirs(FRAME_SAVE_PATH, exist_ok=True)
os.makedirs(CAM_SAVE_PATH, exist_ok=True)
os.makedirs(CKPT_SAVE_PATH, exist_ok=True)
# Display names for the selectable checkpoints (dropdown order).
CKPT_NAME = [
    '✨Unified-detector_v1_Fine-tuned_on_4_classes',
    'DfD-Checkpoint_Fine-tuned_on_FF++',
    'FAS-Checkpoint_Fine-tuned_on_MCIO',
]
# Checkpoint file path inside the 'Wolowolo/fsfm-3c' HF Hub repo.
CKPT_PATH = {
    '✨Unified-detector_v1_Fine-tuned_on_4_classes': 'finetuned_models/Unified-detector/v1_Fine-tuned_on_4_classes/checkpoint-min_train_loss.pth',
    'DfD-Checkpoint_Fine-tuned_on_FF++': 'finetuned_models/FF++_c23_32frames/checkpoint-min_val_loss.pth',
    'FAS-Checkpoint_Fine-tuned_on_MCIO': 'finetuned_models/MCIO_protocol/Both_MCIO/checkpoint-min_val_loss.pth',
}
# Number of output classes per checkpoint (4-way unified vs binary heads).
CKPT_CLASS = {
    '✨Unified-detector_v1_Fine-tuned_on_4_classes': 4,
    'DfD-Checkpoint_Fine-tuned_on_FF++': 2,
    'FAS-Checkpoint_Fine-tuned_on_MCIO': 2
}
# Backbone factory name (key into models_vit.__dict__) per checkpoint.
CKPT_MODEL = {
    '✨Unified-detector_v1_Fine-tuned_on_4_classes': 'vit_base_patch16',
    'DfD-Checkpoint_Fine-tuned_on_FF++': 'vit_base_patch16',
    'FAS-Checkpoint_Fine-tuned_on_MCIO': 'vit_base_patch16',
}


# Gradio UI: model selector, side-by-side image/video detection panels,
# and the event wiring that connects buttons to the detection functions.
with gr.Blocks(css=".custom-label { font-weight: bold !important; font-size: 16px !important; }") as demo:
    gr.HTML(
        "<h1 style='text-align: center;'>🦱 Real Facial Image&Video Detection <br> Against Face Forgery (Deepfake/Diffusion) and Spoofing (Presentation-attacks)</h1>")
    gr.Markdown(
        "<b>☉ Powered by the fine-tuned ViT models that is pre-trained from [FSFM-3C](https://fsfm-3c.github.io/)</b> <br> "
        "<b>☉ We do not and cannot access or store the data you have uploaded!</b> <br> "
        "<b>☉ Release (Continuously updating [by [Gaojian Wang/汪高健](https://scholar.google.com/citations?user=tpP4cFQAAAAJ&hl=zh-CN&oi=ao), [Tong Wu/吴桐](https://github.com/Coco-T-T), [Xingtang Luo/罗兴塘](https://github.com/Rox-C)]) </b> <br> <b>[V1.0] 2025/02/22-Current🎉</b>: "
        "1) Updated <b>[✨Unified-detector_v1] for Physical-Digital Face Attack&Forgery Detection, a ViT-B/16-224 (FSFM Pre-trained) detector that could identify Real&Bonafide, Deepfake, Diffusion&AIGC, Spooing&Presentation-attacks facial images or videos </b> ; 2) Provided the selection of the number of video frames (uniformly sampling 1-32 frames, more frames may time-consuming for this page without GPU acceleration); 3) Fixed some errors of V0.1. <br>"
        "<b>[V0.1] 2024/12-2025/02/21</b>: "
        "Create this page with basic detectors [DfD-Checkpoint_Fine-tuned_on_FF++, FAS-Checkpoint_Fine-tuned_on_MCIO] that follow the paper implementation. <br> ")
    gr.Markdown(
        "- Please <b>provide a facial image or video(<100s)</b>, and <b>select the model</b> for detection: <br> <b>[SUGGEST] [✨Unified-detector_v1_Fine-tuned_on_4_classes]</b> a (FSFM Pre-trained) ViT-B/16-224 for Both Real/Deepfake/Diffusion/Spoofing facial images&videos Detection <br> <b>[DfD-Checkpoint_Fine-tuned_on_FF++]</b> for deepfake detection, FSFM ViT-B/16-224 fine-tuned on the FF++_c23 train&val sets (4 manipulations, 32 frames per video) <br> <b>[FAS-Checkpoint_Fine-tuned_on_MCIO]</b> for face anti-spoofing, FSFM ViT-B/16-224 fine-tuned on the MCIO datasets (2 frames per video)")

    # Checkpoint selector + load status.
    with gr.Row():
        ckpt_select_dropdown = gr.Dropdown(
            label="Select the Model for Detection ⬇️",
            elem_classes="custom-label",
            choices=['Choose Model Here 🖱️'] + CKPT_NAME + ['continuously updating...'],
            multiselect=False,
            value='Choose Model Here 🖱️',
            interactive=True,
        )
        model_loading_status = gr.Textbox(label="Model Loading Status")
    with gr.Row():
        # Left panel: single-image detection + Grad-CAM visualization.
        with gr.Column(scale=5):
            gr.Markdown(
                "### Image Detection (Fast Try: copying image from [whichfaceisreal](https://whichfaceisreal.com/))")
            with gr.Row():
                image = gr.Image(label="Upload/Capture/Paste your image", type="pil")
                with gr.Column():
                    output_results_image = gr.Textbox(label="Detection Result")
                    image_submit_btn = gr.Button("Submit")
                    output_results_image_detail = gr.Textbox(label="Detection Result Detail")
                    image_visualize_btn = gr.Button("Visualize")
                    output_heatmap = gr.Image(label="Grad CAM Visualizations")

        # Right panel: video detection with per-frame results + CAM gallery.
        with gr.Column(scale=5):
            gr.Markdown("### Video Detection")
            with gr.Row():
                video = gr.Video(label="Upload/Capture your video")
                with gr.Column():
                    output_results_video = gr.Textbox(label="Detection Result")
                    video_submit_btn = gr.Button("Submit")
                    frame_slider = gr.Slider(minimum=1, maximum=32, step=1, value=32, label="Number of Frames for Detection")
                    with gr.Accordion("Detection Result Detail", open=False):
                        output_results_video_detail = gr.Textbox(label="",lines = 10)
                    video_visualize_btn = gr.Button("Visualize")
                    output_video_heatmap = gr.Gallery(label="Grad CAM Visualizations")

    # Visitor-map badge.
    gr.HTML(
        '<div style="display: flex; justify-content: center; gap: 20px; margin-bottom: 20px;">'
        '<a href="https://mapmyvisitors.com/web/1bxvi" title="Visit tracker">'
        '<img src="https://mapmyvisitors.com/map.png?d=FYhBoxLDEaFAxdfRzk5TuchYOBGrnSa98Ky59EkEEpY&cl=ffffff">'
        '</a>'
        '</div>'
    )

    # Event wiring: each handler returns values matching its outputs list.
    ckpt_select_dropdown.change(
        fn=load_model,
        inputs=[ckpt_select_dropdown],
        outputs=[ckpt_select_dropdown, model_loading_status],
    )
    image_submit_btn.click(
        fn=FSFM3C_image_detection,
        inputs=[image],
        outputs=[output_results_image_detail,output_results_image],
    )
    image_visualize_btn.click(
        fn=image_Grad_CAM,
        inputs=[image,output_results_image],
        outputs=[output_heatmap]
    )
    video_submit_btn.click(
        fn=FSFM3C_video_detection,
        inputs=[video, frame_slider],
        outputs=[output_results_video_detail,output_results_video],
    )
    video_visualize_btn.click(
        fn=video_Grad_CAM,
        inputs=[output_results_video],
        outputs=[output_video_heatmap],
    )
if __name__ == "__main__":
    # Parse CLI args first: load_model reads the drop_path/global_pool/
    # nb_classes fields from the global `args`.
    args = get_args_parser().parse_args()
    # Warm-start with the default unified 4-class detector so the globals
    # (ckpt/device/model/checkpoint) used by the Gradio callbacks are ready
    # before the UI comes up. This replaces a byte-for-byte duplicate of
    # load_model's body that previously lived here.
    load_model('✨Unified-detector_v1_Fine-tuned_on_4_classes')

    gr.close_all()
    demo.queue()
    demo.launch()