Spaces:
Runtime error
Runtime error
Spyderzz committed on
Commit ·
3909c31
1
Parent(s): 4ef8b6a
feat: merge DeepShield1 EfficientNet ensemble into production deployment
Browse files- config.py +7 -0
- models/heatmap_generator.py +75 -6
- models/icpr2020dfdc/.gitignore +5 -0
- models/icpr2020dfdc/.travis.yml +15 -0
- models/icpr2020dfdc/LICENSE +674 -0
- models/icpr2020dfdc/README.md +120 -0
- models/icpr2020dfdc/architectures/__init__.py +0 -0
- models/icpr2020dfdc/architectures/externals/__init__.py +1 -0
- models/icpr2020dfdc/architectures/externals/xception.py +236 -0
- models/icpr2020dfdc/architectures/fornet.py +245 -0
- models/icpr2020dfdc/architectures/tripletnet.py +44 -0
- models/icpr2020dfdc/architectures/weights.py +24 -0
- models/icpr2020dfdc/blazeface/__init__.py +3 -0
- models/icpr2020dfdc/blazeface/anchors.npy +3 -0
- models/icpr2020dfdc/blazeface/blazeface.pth +3 -0
- models/icpr2020dfdc/blazeface/blazeface.py +417 -0
- models/icpr2020dfdc/blazeface/face_extract.py +470 -0
- models/icpr2020dfdc/blazeface/read_video.py +213 -0
- models/icpr2020dfdc/environment.yml +25 -0
- models/icpr2020dfdc/extract_faces.py +346 -0
- models/icpr2020dfdc/index_celebdf.py +85 -0
- models/icpr2020dfdc/index_dfdc.py +94 -0
- models/icpr2020dfdc/index_ffpp.py +92 -0
- models/icpr2020dfdc/isplutils/__init__.py +0 -0
- models/icpr2020dfdc/isplutils/data.py +263 -0
- models/icpr2020dfdc/isplutils/data_siamese.py +78 -0
- models/icpr2020dfdc/isplutils/split.py +135 -0
- models/icpr2020dfdc/isplutils/utils.py +247 -0
- models/icpr2020dfdc/test_model.py +270 -0
- models/icpr2020dfdc/train_binclass.py +460 -0
- models/icpr2020dfdc/train_triplet.py +459 -0
- models/model_loader.py +18 -0
- requirements.txt +7 -0
- schemas/common.py +1 -0
- services/efficientnet_service.py +209 -0
- services/image_service.py +81 -8
- services/metadata_writer.py +73 -0
- services/video_service.py +104 -33
- v1/__pycache__/__init__.cpython-311.pyc +0 -0
- v1/__pycache__/analyze.cpython-311.pyc +0 -0
- v1/__pycache__/auth.cpython-311.pyc +0 -0
- v1/__pycache__/health.cpython-311.pyc +0 -0
- v1/__pycache__/history.cpython-311.pyc +0 -0
- v1/__pycache__/report.cpython-311.pyc +0 -0
- v1/analyze.py +26 -2
config.py
CHANGED
|
@@ -42,6 +42,13 @@ class Settings(BaseSettings):
|
|
| 42 |
LLM_API_KEY: str = ""
|
| 43 |
LLM_MODEL: str = "gemini-2.5-pro" # or "gpt-4o"
|
| 44 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
# Auth
|
| 46 |
JWT_SECRET_KEY: str = "change-me-in-production"
|
| 47 |
JWT_ALGORITHM: str = "HS256"
|
|
|
|
| 42 |
LLM_API_KEY: str = ""
|
| 43 |
LLM_MODEL: str = "gemini-2.5-pro" # or "gpt-4o"
|
| 44 |
|
| 45 |
+
# EfficientNet (ICPR2020 / DeepShield1 merge)
|
| 46 |
+
EFFICIENTNET_MODEL: str = "EfficientNetAutoAttB4"
|
| 47 |
+
EFFICIENTNET_TRAIN_DB: str = "DFDC"
|
| 48 |
+
ENSEMBLE_MODE: bool = True # run both ViT + EfficientNet and average scores
|
| 49 |
+
VIDEO_SAMPLE_FRAMES: int = 16 # frames to sample per video for inference
|
| 50 |
+
EXIFTOOL_PATH: str = "" # full path to ExifTool binary; empty = metadata write disabled
|
| 51 |
+
|
| 52 |
# Auth
|
| 53 |
JWT_SECRET_KEY: str = "change-me-in-production"
|
| 54 |
JWT_ALGORITHM: str = "HS256"
|
models/heatmap_generator.py
CHANGED
|
@@ -2,7 +2,7 @@ from __future__ import annotations
|
|
| 2 |
|
| 3 |
import base64
|
| 4 |
import io
|
| 5 |
-
from typing import Optional
|
| 6 |
|
| 7 |
import cv2
|
| 8 |
import numpy as np
|
|
@@ -107,15 +107,84 @@ def _compute_gradcam_pp(
|
|
| 107 |
return grayscale_cam, rgb_float
|
| 108 |
|
| 109 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
def generate_heatmap_base64(
|
| 111 |
pil_img: Image.Image,
|
| 112 |
target_class_idx: Optional[int] = None,
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
overlay = show_cam_on_image(rgb_float, grayscale_cam, use_rgb=True)
|
| 117 |
-
logger.info(f"Heatmap generated ({overlay.shape[0]}x{overlay.shape[1]})")
|
| 118 |
-
return _encode_overlay_to_base64(overlay)
|
| 119 |
|
| 120 |
|
| 121 |
def generate_boxes_base64(
|
|
|
|
| 2 |
|
| 3 |
import base64
|
| 4 |
import io
|
| 5 |
+
from typing import Literal, Optional
|
| 6 |
|
| 7 |
import cv2
|
| 8 |
import numpy as np
|
|
|
|
| 107 |
return grayscale_cam, rgb_float
|
| 108 |
|
| 109 |
|
| 110 |
+
def _compute_gradcam_pp_efficientnet(
    pil_img: Image.Image,
) -> tuple[np.ndarray, np.ndarray, Literal["attention", "gradcam++", "fallback"]]:
    """Compute a Grad-CAM++ map for the first face found by the EfficientNet pipeline.

    Returns ``(grayscale_cam, rgb_float, heatmap_source)`` where ``rgb_float``
    is the first detected face crop scaled to float32 in [0, 1] and resized to
    224x224, and ``heatmap_source`` is:

    * ``"gradcam++"`` when Grad-CAM++ on the last block
      (``net.efficientnet._blocks[-1]``) succeeded;
    * ``"fallback"`` when Grad-CAM++ raised and a uniform 0.5 map of shape
      (224, 224) is returned instead.

    ``"attention"`` stays in the return annotation for backward compatibility
    only; this function never produces it.

    Raises:
        RuntimeError: if the model loader returns no EfficientNet instance.
        ValueError: with message ``"no_face"`` if the face extractor reports
            no faces for the image.
    """
    loader = get_model_loader()
    eff = loader.load_efficientnet()
    if eff is None:
        raise RuntimeError("EfficientNet not loaded")

    if pil_img.mode != "RGB":
        pil_img = pil_img.convert("RGB")
    img_np = np.array(pil_img)

    # Prepare face crop (same path as detect_image).
    frame_data = eff.face_extractor.process_image(img=img_np)
    faces: list = frame_data.get("faces", [])
    if not faces:
        raise ValueError("no_face")

    # Only the first detected face is explained.
    face_np = faces[0]
    face_t = eff._face_tensor(face_np).unsqueeze(0).to(eff.device)

    # Float [0, 1] copy of the face crop used as the overlay background.
    # NOTE(review): assumes the CAM produced below is also 224x224 so that it
    # lines up with this image -- confirm against eff._face_tensor.
    rgb_float = face_np.astype(np.float32) / 255.0
    if rgb_float.shape[:2] != (224, 224):
        rgb_float = cv2.resize(rgb_float, (224, 224)).astype(np.float32)

    # Try Grad-CAM++ on last MBConv block (_blocks[-1]).
    try:
        net = eff.net
        target_layers = [net.efficientnet._blocks[-1]]

        # NOTE: gradients are switched on for the input and for every model
        # parameter, and are not switched back off afterwards.
        face_t.requires_grad_(True)
        for p in net.parameters():
            p.requires_grad_(True)

        with GradCAMPlusPlus(model=net, target_layers=target_layers) as cam:
            grayscale_cam = cam(input_tensor=face_t, targets=None)[0]

        return grayscale_cam, rgb_float, "gradcam++"
    except Exception as e:
        # Best-effort: keep producing an overlay, but label it honestly so
        # callers can tell a flat placeholder from a real Grad-CAM++ map.
        logger.warning(f"EfficientNet Grad-CAM++ failed ({e}), using uniform fallback")
        grayscale_cam = np.ones((224, 224), dtype=np.float32) * 0.5
        return grayscale_cam, rgb_float, "fallback"
|
| 160 |
+
|
| 161 |
+
|
| 162 |
def generate_heatmap_base64(
    pil_img: Image.Image,
    target_class_idx: Optional[int] = None,
    model_family: Literal["vit", "efficientnet"] = "vit",
) -> tuple[str, str]:
    """Render the CAM overlay for ``pil_img`` and return it with its source label.

    Returns ``(encoded_overlay, heatmap_source)``. The overlay string is
    whatever ``_encode_overlay_to_base64`` produces for the blended image.

    * For any ``model_family`` other than ``"efficientnet"`` the map comes
      from ``_compute_gradcam_pp`` and the source is ``"gradcam++"``;
      ``target_class_idx`` is only used on this path.
    * For ``"efficientnet"`` the source is the one reported by
      ``_compute_gradcam_pp_efficientnet``. A ``ValueError`` from that helper
      yields ``("", "none")``; any other exception yields ``("", "fallback")``.
    """
    if model_family != "efficientnet":
        cam_map, base_rgb = _compute_gradcam_pp(pil_img, target_class_idx)
        origin = "gradcam++"
    else:
        try:
            cam_map, base_rgb, origin = _compute_gradcam_pp_efficientnet(pil_img)
        except ValueError:
            # The helper raises ValueError when it finds no face to explain.
            logger.info("EfficientNet heatmap skipped — no face detected")
            return "", "none"
        except Exception as err:
            logger.warning(f"EfficientNet heatmap failed: {err}")
            return "", "fallback"

    blended = show_cam_on_image(base_rgb, cam_map, use_rgb=True)
    rows, cols = blended.shape[0], blended.shape[1]
    logger.info(f"Heatmap generated ({rows}x{cols}) source={origin}")
    encoded = _encode_overlay_to_base64(blended)
    return encoded, origin
|
| 188 |
|
| 189 |
|
| 190 |
def generate_boxes_base64(
|
models/icpr2020dfdc/.gitignore
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
.idea/
|
| 3 |
+
.DS_Store
|
| 4 |
+
.ipynb_checkpoints/
|
| 5 |
+
__pycache__/
|
models/icpr2020dfdc/.travis.yml
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
language: python
|
| 2 |
+
python:
|
| 3 |
+
- "3.6.9"
|
| 4 |
+
install:
|
| 5 |
+
- wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O $HOME/miniconda.sh
|
| 6 |
+
- bash $HOME/miniconda.sh -bfp $HOME/miniconda3
|
| 7 |
+
- export PATH=$HOME/miniconda3/bin:$PATH
|
| 8 |
+
- conda env create -f environment.yml
|
| 9 |
+
before_script:
|
| 10 |
+
- source activate icpr2020
|
| 11 |
+
- cd test
|
| 12 |
+
script:
|
| 13 |
+
- python -m unittest test_dfdc.TestDFDC
|
| 14 |
+
- python -m unittest test_ffpp.TestFFPP
|
| 15 |
+
|
models/icpr2020dfdc/LICENSE
ADDED
|
@@ -0,0 +1,674 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
GNU GENERAL PUBLIC LICENSE
|
| 2 |
+
Version 3, 29 June 2007
|
| 3 |
+
|
| 4 |
+
Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
|
| 5 |
+
Everyone is permitted to copy and distribute verbatim copies
|
| 6 |
+
of this license document, but changing it is not allowed.
|
| 7 |
+
|
| 8 |
+
Preamble
|
| 9 |
+
|
| 10 |
+
The GNU General Public License is a free, copyleft license for
|
| 11 |
+
software and other kinds of works.
|
| 12 |
+
|
| 13 |
+
The licenses for most software and other practical works are designed
|
| 14 |
+
to take away your freedom to share and change the works. By contrast,
|
| 15 |
+
the GNU General Public License is intended to guarantee your freedom to
|
| 16 |
+
share and change all versions of a program--to make sure it remains free
|
| 17 |
+
software for all its users. We, the Free Software Foundation, use the
|
| 18 |
+
GNU General Public License for most of our software; it applies also to
|
| 19 |
+
any other work released this way by its authors. You can apply it to
|
| 20 |
+
your programs, too.
|
| 21 |
+
|
| 22 |
+
When we speak of free software, we are referring to freedom, not
|
| 23 |
+
price. Our General Public Licenses are designed to make sure that you
|
| 24 |
+
have the freedom to distribute copies of free software (and charge for
|
| 25 |
+
them if you wish), that you receive source code or can get it if you
|
| 26 |
+
want it, that you can change the software or use pieces of it in new
|
| 27 |
+
free programs, and that you know you can do these things.
|
| 28 |
+
|
| 29 |
+
To protect your rights, we need to prevent others from denying you
|
| 30 |
+
these rights or asking you to surrender the rights. Therefore, you have
|
| 31 |
+
certain responsibilities if you distribute copies of the software, or if
|
| 32 |
+
you modify it: responsibilities to respect the freedom of others.
|
| 33 |
+
|
| 34 |
+
For example, if you distribute copies of such a program, whether
|
| 35 |
+
gratis or for a fee, you must pass on to the recipients the same
|
| 36 |
+
freedoms that you received. You must make sure that they, too, receive
|
| 37 |
+
or can get the source code. And you must show them these terms so they
|
| 38 |
+
know their rights.
|
| 39 |
+
|
| 40 |
+
Developers that use the GNU GPL protect your rights with two steps:
|
| 41 |
+
(1) assert copyright on the software, and (2) offer you this License
|
| 42 |
+
giving you legal permission to copy, distribute and/or modify it.
|
| 43 |
+
|
| 44 |
+
For the developers' and authors' protection, the GPL clearly explains
|
| 45 |
+
that there is no warranty for this free software. For both users' and
|
| 46 |
+
authors' sake, the GPL requires that modified versions be marked as
|
| 47 |
+
changed, so that their problems will not be attributed erroneously to
|
| 48 |
+
authors of previous versions.
|
| 49 |
+
|
| 50 |
+
Some devices are designed to deny users access to install or run
|
| 51 |
+
modified versions of the software inside them, although the manufacturer
|
| 52 |
+
can do so. This is fundamentally incompatible with the aim of
|
| 53 |
+
protecting users' freedom to change the software. The systematic
|
| 54 |
+
pattern of such abuse occurs in the area of products for individuals to
|
| 55 |
+
use, which is precisely where it is most unacceptable. Therefore, we
|
| 56 |
+
have designed this version of the GPL to prohibit the practice for those
|
| 57 |
+
products. If such problems arise substantially in other domains, we
|
| 58 |
+
stand ready to extend this provision to those domains in future versions
|
| 59 |
+
of the GPL, as needed to protect the freedom of users.
|
| 60 |
+
|
| 61 |
+
Finally, every program is threatened constantly by software patents.
|
| 62 |
+
States should not allow patents to restrict development and use of
|
| 63 |
+
software on general-purpose computers, but in those that do, we wish to
|
| 64 |
+
avoid the special danger that patents applied to a free program could
|
| 65 |
+
make it effectively proprietary. To prevent this, the GPL assures that
|
| 66 |
+
patents cannot be used to render the program non-free.
|
| 67 |
+
|
| 68 |
+
The precise terms and conditions for copying, distribution and
|
| 69 |
+
modification follow.
|
| 70 |
+
|
| 71 |
+
TERMS AND CONDITIONS
|
| 72 |
+
|
| 73 |
+
0. Definitions.
|
| 74 |
+
|
| 75 |
+
"This License" refers to version 3 of the GNU General Public License.
|
| 76 |
+
|
| 77 |
+
"Copyright" also means copyright-like laws that apply to other kinds of
|
| 78 |
+
works, such as semiconductor masks.
|
| 79 |
+
|
| 80 |
+
"The Program" refers to any copyrightable work licensed under this
|
| 81 |
+
License. Each licensee is addressed as "you". "Licensees" and
|
| 82 |
+
"recipients" may be individuals or organizations.
|
| 83 |
+
|
| 84 |
+
To "modify" a work means to copy from or adapt all or part of the work
|
| 85 |
+
in a fashion requiring copyright permission, other than the making of an
|
| 86 |
+
exact copy. The resulting work is called a "modified version" of the
|
| 87 |
+
earlier work or a work "based on" the earlier work.
|
| 88 |
+
|
| 89 |
+
A "covered work" means either the unmodified Program or a work based
|
| 90 |
+
on the Program.
|
| 91 |
+
|
| 92 |
+
To "propagate" a work means to do anything with it that, without
|
| 93 |
+
permission, would make you directly or secondarily liable for
|
| 94 |
+
infringement under applicable copyright law, except executing it on a
|
| 95 |
+
computer or modifying a private copy. Propagation includes copying,
|
| 96 |
+
distribution (with or without modification), making available to the
|
| 97 |
+
public, and in some countries other activities as well.
|
| 98 |
+
|
| 99 |
+
To "convey" a work means any kind of propagation that enables other
|
| 100 |
+
parties to make or receive copies. Mere interaction with a user through
|
| 101 |
+
a computer network, with no transfer of a copy, is not conveying.
|
| 102 |
+
|
| 103 |
+
An interactive user interface displays "Appropriate Legal Notices"
|
| 104 |
+
to the extent that it includes a convenient and prominently visible
|
| 105 |
+
feature that (1) displays an appropriate copyright notice, and (2)
|
| 106 |
+
tells the user that there is no warranty for the work (except to the
|
| 107 |
+
extent that warranties are provided), that licensees may convey the
|
| 108 |
+
work under this License, and how to view a copy of this License. If
|
| 109 |
+
the interface presents a list of user commands or options, such as a
|
| 110 |
+
menu, a prominent item in the list meets this criterion.
|
| 111 |
+
|
| 112 |
+
1. Source Code.
|
| 113 |
+
|
| 114 |
+
The "source code" for a work means the preferred form of the work
|
| 115 |
+
for making modifications to it. "Object code" means any non-source
|
| 116 |
+
form of a work.
|
| 117 |
+
|
| 118 |
+
A "Standard Interface" means an interface that either is an official
|
| 119 |
+
standard defined by a recognized standards body, or, in the case of
|
| 120 |
+
interfaces specified for a particular programming language, one that
|
| 121 |
+
is widely used among developers working in that language.
|
| 122 |
+
|
| 123 |
+
The "System Libraries" of an executable work include anything, other
|
| 124 |
+
than the work as a whole, that (a) is included in the normal form of
|
| 125 |
+
packaging a Major Component, but which is not part of that Major
|
| 126 |
+
Component, and (b) serves only to enable use of the work with that
|
| 127 |
+
Major Component, or to implement a Standard Interface for which an
|
| 128 |
+
implementation is available to the public in source code form. A
|
| 129 |
+
"Major Component", in this context, means a major essential component
|
| 130 |
+
(kernel, window system, and so on) of the specific operating system
|
| 131 |
+
(if any) on which the executable work runs, or a compiler used to
|
| 132 |
+
produce the work, or an object code interpreter used to run it.
|
| 133 |
+
|
| 134 |
+
The "Corresponding Source" for a work in object code form means all
|
| 135 |
+
the source code needed to generate, install, and (for an executable
|
| 136 |
+
work) run the object code and to modify the work, including scripts to
|
| 137 |
+
control those activities. However, it does not include the work's
|
| 138 |
+
System Libraries, or general-purpose tools or generally available free
|
| 139 |
+
programs which are used unmodified in performing those activities but
|
| 140 |
+
which are not part of the work. For example, Corresponding Source
|
| 141 |
+
includes interface definition files associated with source files for
|
| 142 |
+
the work, and the source code for shared libraries and dynamically
|
| 143 |
+
linked subprograms that the work is specifically designed to require,
|
| 144 |
+
such as by intimate data communication or control flow between those
|
| 145 |
+
subprograms and other parts of the work.
|
| 146 |
+
|
| 147 |
+
The Corresponding Source need not include anything that users
|
| 148 |
+
can regenerate automatically from other parts of the Corresponding
|
| 149 |
+
Source.
|
| 150 |
+
|
| 151 |
+
The Corresponding Source for a work in source code form is that
|
| 152 |
+
same work.
|
| 153 |
+
|
| 154 |
+
2. Basic Permissions.
|
| 155 |
+
|
| 156 |
+
All rights granted under this License are granted for the term of
|
| 157 |
+
copyright on the Program, and are irrevocable provided the stated
|
| 158 |
+
conditions are met. This License explicitly affirms your unlimited
|
| 159 |
+
permission to run the unmodified Program. The output from running a
|
| 160 |
+
covered work is covered by this License only if the output, given its
|
| 161 |
+
content, constitutes a covered work. This License acknowledges your
|
| 162 |
+
rights of fair use or other equivalent, as provided by copyright law.
|
| 163 |
+
|
| 164 |
+
You may make, run and propagate covered works that you do not
|
| 165 |
+
convey, without conditions so long as your license otherwise remains
|
| 166 |
+
in force. You may convey covered works to others for the sole purpose
|
| 167 |
+
of having them make modifications exclusively for you, or provide you
|
| 168 |
+
with facilities for running those works, provided that you comply with
|
| 169 |
+
the terms of this License in conveying all material for which you do
|
| 170 |
+
not control copyright. Those thus making or running the covered works
|
| 171 |
+
for you must do so exclusively on your behalf, under your direction
|
| 172 |
+
and control, on terms that prohibit them from making any copies of
|
| 173 |
+
your copyrighted material outside their relationship with you.
|
| 174 |
+
|
| 175 |
+
Conveying under any other circumstances is permitted solely under
|
| 176 |
+
the conditions stated below. Sublicensing is not allowed; section 10
|
| 177 |
+
makes it unnecessary.
|
| 178 |
+
|
| 179 |
+
3. Protecting Users' Legal Rights From Anti-Circumvention Law.
|
| 180 |
+
|
| 181 |
+
No covered work shall be deemed part of an effective technological
|
| 182 |
+
measure under any applicable law fulfilling obligations under article
|
| 183 |
+
11 of the WIPO copyright treaty adopted on 20 December 1996, or
|
| 184 |
+
similar laws prohibiting or restricting circumvention of such
|
| 185 |
+
measures.
|
| 186 |
+
|
| 187 |
+
When you convey a covered work, you waive any legal power to forbid
|
| 188 |
+
circumvention of technological measures to the extent such circumvention
|
| 189 |
+
is effected by exercising rights under this License with respect to
|
| 190 |
+
the covered work, and you disclaim any intention to limit operation or
|
| 191 |
+
modification of the work as a means of enforcing, against the work's
|
| 192 |
+
users, your or third parties' legal rights to forbid circumvention of
|
| 193 |
+
technological measures.
|
| 194 |
+
|
| 195 |
+
4. Conveying Verbatim Copies.
|
| 196 |
+
|
| 197 |
+
You may convey verbatim copies of the Program's source code as you
|
| 198 |
+
receive it, in any medium, provided that you conspicuously and
|
| 199 |
+
appropriately publish on each copy an appropriate copyright notice;
|
| 200 |
+
keep intact all notices stating that this License and any
|
| 201 |
+
non-permissive terms added in accord with section 7 apply to the code;
|
| 202 |
+
keep intact all notices of the absence of any warranty; and give all
|
| 203 |
+
recipients a copy of this License along with the Program.
|
| 204 |
+
|
| 205 |
+
You may charge any price or no price for each copy that you convey,
|
| 206 |
+
and you may offer support or warranty protection for a fee.
|
| 207 |
+
|
| 208 |
+
5. Conveying Modified Source Versions.
|
| 209 |
+
|
| 210 |
+
You may convey a work based on the Program, or the modifications to
|
| 211 |
+
produce it from the Program, in the form of source code under the
|
| 212 |
+
terms of section 4, provided that you also meet all of these conditions:
|
| 213 |
+
|
| 214 |
+
a) The work must carry prominent notices stating that you modified
|
| 215 |
+
it, and giving a relevant date.
|
| 216 |
+
|
| 217 |
+
b) The work must carry prominent notices stating that it is
|
| 218 |
+
released under this License and any conditions added under section
|
| 219 |
+
7. This requirement modifies the requirement in section 4 to
|
| 220 |
+
"keep intact all notices".
|
| 221 |
+
|
| 222 |
+
c) You must license the entire work, as a whole, under this
|
| 223 |
+
License to anyone who comes into possession of a copy. This
|
| 224 |
+
License will therefore apply, along with any applicable section 7
|
| 225 |
+
additional terms, to the whole of the work, and all its parts,
|
| 226 |
+
regardless of how they are packaged. This License gives no
|
| 227 |
+
permission to license the work in any other way, but it does not
|
| 228 |
+
invalidate such permission if you have separately received it.
|
| 229 |
+
|
| 230 |
+
d) If the work has interactive user interfaces, each must display
|
| 231 |
+
Appropriate Legal Notices; however, if the Program has interactive
|
| 232 |
+
interfaces that do not display Appropriate Legal Notices, your
|
| 233 |
+
work need not make them do so.
|
| 234 |
+
|
| 235 |
+
A compilation of a covered work with other separate and independent
|
| 236 |
+
works, which are not by their nature extensions of the covered work,
|
| 237 |
+
and which are not combined with it such as to form a larger program,
|
| 238 |
+
in or on a volume of a storage or distribution medium, is called an
|
| 239 |
+
"aggregate" if the compilation and its resulting copyright are not
|
| 240 |
+
used to limit the access or legal rights of the compilation's users
|
| 241 |
+
beyond what the individual works permit. Inclusion of a covered work
|
| 242 |
+
in an aggregate does not cause this License to apply to the other
|
| 243 |
+
parts of the aggregate.
|
| 244 |
+
|
| 245 |
+
6. Conveying Non-Source Forms.
|
| 246 |
+
|
| 247 |
+
You may convey a covered work in object code form under the terms
|
| 248 |
+
of sections 4 and 5, provided that you also convey the
|
| 249 |
+
machine-readable Corresponding Source under the terms of this License,
|
| 250 |
+
in one of these ways:
|
| 251 |
+
|
| 252 |
+
a) Convey the object code in, or embodied in, a physical product
|
| 253 |
+
(including a physical distribution medium), accompanied by the
|
| 254 |
+
Corresponding Source fixed on a durable physical medium
|
| 255 |
+
customarily used for software interchange.
|
| 256 |
+
|
| 257 |
+
b) Convey the object code in, or embodied in, a physical product
|
| 258 |
+
(including a physical distribution medium), accompanied by a
|
| 259 |
+
written offer, valid for at least three years and valid for as
|
| 260 |
+
long as you offer spare parts or customer support for that product
|
| 261 |
+
model, to give anyone who possesses the object code either (1) a
|
| 262 |
+
copy of the Corresponding Source for all the software in the
|
| 263 |
+
product that is covered by this License, on a durable physical
|
| 264 |
+
medium customarily used for software interchange, for a price no
|
| 265 |
+
more than your reasonable cost of physically performing this
|
| 266 |
+
conveying of source, or (2) access to copy the
|
| 267 |
+
Corresponding Source from a network server at no charge.
|
| 268 |
+
|
| 269 |
+
c) Convey individual copies of the object code with a copy of the
|
| 270 |
+
written offer to provide the Corresponding Source. This
|
| 271 |
+
alternative is allowed only occasionally and noncommercially, and
|
| 272 |
+
only if you received the object code with such an offer, in accord
|
| 273 |
+
with subsection 6b.
|
| 274 |
+
|
| 275 |
+
d) Convey the object code by offering access from a designated
|
| 276 |
+
place (gratis or for a charge), and offer equivalent access to the
|
| 277 |
+
Corresponding Source in the same way through the same place at no
|
| 278 |
+
further charge. You need not require recipients to copy the
|
| 279 |
+
Corresponding Source along with the object code. If the place to
|
| 280 |
+
copy the object code is a network server, the Corresponding Source
|
| 281 |
+
may be on a different server (operated by you or a third party)
|
| 282 |
+
that supports equivalent copying facilities, provided you maintain
|
| 283 |
+
clear directions next to the object code saying where to find the
|
| 284 |
+
Corresponding Source. Regardless of what server hosts the
|
| 285 |
+
Corresponding Source, you remain obligated to ensure that it is
|
| 286 |
+
available for as long as needed to satisfy these requirements.
|
| 287 |
+
|
| 288 |
+
e) Convey the object code using peer-to-peer transmission, provided
|
| 289 |
+
you inform other peers where the object code and Corresponding
|
| 290 |
+
Source of the work are being offered to the general public at no
|
| 291 |
+
charge under subsection 6d.
|
| 292 |
+
|
| 293 |
+
A separable portion of the object code, whose source code is excluded
|
| 294 |
+
from the Corresponding Source as a System Library, need not be
|
| 295 |
+
included in conveying the object code work.
|
| 296 |
+
|
| 297 |
+
A "User Product" is either (1) a "consumer product", which means any
|
| 298 |
+
tangible personal property which is normally used for personal, family,
|
| 299 |
+
or household purposes, or (2) anything designed or sold for incorporation
|
| 300 |
+
into a dwelling. In determining whether a product is a consumer product,
|
| 301 |
+
doubtful cases shall be resolved in favor of coverage. For a particular
|
| 302 |
+
product received by a particular user, "normally used" refers to a
|
| 303 |
+
typical or common use of that class of product, regardless of the status
|
| 304 |
+
of the particular user or of the way in which the particular user
|
| 305 |
+
actually uses, or expects or is expected to use, the product. A product
|
| 306 |
+
is a consumer product regardless of whether the product has substantial
|
| 307 |
+
commercial, industrial or non-consumer uses, unless such uses represent
|
| 308 |
+
the only significant mode of use of the product.
|
| 309 |
+
|
| 310 |
+
"Installation Information" for a User Product means any methods,
|
| 311 |
+
procedures, authorization keys, or other information required to install
|
| 312 |
+
and execute modified versions of a covered work in that User Product from
|
| 313 |
+
a modified version of its Corresponding Source. The information must
|
| 314 |
+
suffice to ensure that the continued functioning of the modified object
|
| 315 |
+
code is in no case prevented or interfered with solely because
|
| 316 |
+
modification has been made.
|
| 317 |
+
|
| 318 |
+
If you convey an object code work under this section in, or with, or
|
| 319 |
+
specifically for use in, a User Product, and the conveying occurs as
|
| 320 |
+
part of a transaction in which the right of possession and use of the
|
| 321 |
+
User Product is transferred to the recipient in perpetuity or for a
|
| 322 |
+
fixed term (regardless of how the transaction is characterized), the
|
| 323 |
+
Corresponding Source conveyed under this section must be accompanied
|
| 324 |
+
by the Installation Information. But this requirement does not apply
|
| 325 |
+
if neither you nor any third party retains the ability to install
|
| 326 |
+
modified object code on the User Product (for example, the work has
|
| 327 |
+
been installed in ROM).
|
| 328 |
+
|
| 329 |
+
The requirement to provide Installation Information does not include a
|
| 330 |
+
requirement to continue to provide support service, warranty, or updates
|
| 331 |
+
for a work that has been modified or installed by the recipient, or for
|
| 332 |
+
the User Product in which it has been modified or installed. Access to a
|
| 333 |
+
network may be denied when the modification itself materially and
|
| 334 |
+
adversely affects the operation of the network or violates the rules and
|
| 335 |
+
protocols for communication across the network.
|
| 336 |
+
|
| 337 |
+
Corresponding Source conveyed, and Installation Information provided,
|
| 338 |
+
in accord with this section must be in a format that is publicly
|
| 339 |
+
documented (and with an implementation available to the public in
|
| 340 |
+
source code form), and must require no special password or key for
|
| 341 |
+
unpacking, reading or copying.
|
| 342 |
+
|
| 343 |
+
7. Additional Terms.
|
| 344 |
+
|
| 345 |
+
"Additional permissions" are terms that supplement the terms of this
|
| 346 |
+
License by making exceptions from one or more of its conditions.
|
| 347 |
+
Additional permissions that are applicable to the entire Program shall
|
| 348 |
+
be treated as though they were included in this License, to the extent
|
| 349 |
+
that they are valid under applicable law. If additional permissions
|
| 350 |
+
apply only to part of the Program, that part may be used separately
|
| 351 |
+
under those permissions, but the entire Program remains governed by
|
| 352 |
+
this License without regard to the additional permissions.
|
| 353 |
+
|
| 354 |
+
When you convey a copy of a covered work, you may at your option
|
| 355 |
+
remove any additional permissions from that copy, or from any part of
|
| 356 |
+
it. (Additional permissions may be written to require their own
|
| 357 |
+
removal in certain cases when you modify the work.) You may place
|
| 358 |
+
additional permissions on material, added by you to a covered work,
|
| 359 |
+
for which you have or can give appropriate copyright permission.
|
| 360 |
+
|
| 361 |
+
Notwithstanding any other provision of this License, for material you
|
| 362 |
+
add to a covered work, you may (if authorized by the copyright holders of
|
| 363 |
+
that material) supplement the terms of this License with terms:
|
| 364 |
+
|
| 365 |
+
a) Disclaiming warranty or limiting liability differently from the
|
| 366 |
+
terms of sections 15 and 16 of this License; or
|
| 367 |
+
|
| 368 |
+
b) Requiring preservation of specified reasonable legal notices or
|
| 369 |
+
author attributions in that material or in the Appropriate Legal
|
| 370 |
+
Notices displayed by works containing it; or
|
| 371 |
+
|
| 372 |
+
c) Prohibiting misrepresentation of the origin of that material, or
|
| 373 |
+
requiring that modified versions of such material be marked in
|
| 374 |
+
reasonable ways as different from the original version; or
|
| 375 |
+
|
| 376 |
+
d) Limiting the use for publicity purposes of names of licensors or
|
| 377 |
+
authors of the material; or
|
| 378 |
+
|
| 379 |
+
e) Declining to grant rights under trademark law for use of some
|
| 380 |
+
trade names, trademarks, or service marks; or
|
| 381 |
+
|
| 382 |
+
f) Requiring indemnification of licensors and authors of that
|
| 383 |
+
material by anyone who conveys the material (or modified versions of
|
| 384 |
+
it) with contractual assumptions of liability to the recipient, for
|
| 385 |
+
any liability that these contractual assumptions directly impose on
|
| 386 |
+
those licensors and authors.
|
| 387 |
+
|
| 388 |
+
All other non-permissive additional terms are considered "further
|
| 389 |
+
restrictions" within the meaning of section 10. If the Program as you
|
| 390 |
+
received it, or any part of it, contains a notice stating that it is
|
| 391 |
+
governed by this License along with a term that is a further
|
| 392 |
+
restriction, you may remove that term. If a license document contains
|
| 393 |
+
a further restriction but permits relicensing or conveying under this
|
| 394 |
+
License, you may add to a covered work material governed by the terms
|
| 395 |
+
of that license document, provided that the further restriction does
|
| 396 |
+
not survive such relicensing or conveying.
|
| 397 |
+
|
| 398 |
+
If you add terms to a covered work in accord with this section, you
|
| 399 |
+
must place, in the relevant source files, a statement of the
|
| 400 |
+
additional terms that apply to those files, or a notice indicating
|
| 401 |
+
where to find the applicable terms.
|
| 402 |
+
|
| 403 |
+
Additional terms, permissive or non-permissive, may be stated in the
|
| 404 |
+
form of a separately written license, or stated as exceptions;
|
| 405 |
+
the above requirements apply either way.
|
| 406 |
+
|
| 407 |
+
8. Termination.
|
| 408 |
+
|
| 409 |
+
You may not propagate or modify a covered work except as expressly
|
| 410 |
+
provided under this License. Any attempt otherwise to propagate or
|
| 411 |
+
modify it is void, and will automatically terminate your rights under
|
| 412 |
+
this License (including any patent licenses granted under the third
|
| 413 |
+
paragraph of section 11).
|
| 414 |
+
|
| 415 |
+
However, if you cease all violation of this License, then your
|
| 416 |
+
license from a particular copyright holder is reinstated (a)
|
| 417 |
+
provisionally, unless and until the copyright holder explicitly and
|
| 418 |
+
finally terminates your license, and (b) permanently, if the copyright
|
| 419 |
+
holder fails to notify you of the violation by some reasonable means
|
| 420 |
+
prior to 60 days after the cessation.
|
| 421 |
+
|
| 422 |
+
Moreover, your license from a particular copyright holder is
|
| 423 |
+
reinstated permanently if the copyright holder notifies you of the
|
| 424 |
+
violation by some reasonable means, this is the first time you have
|
| 425 |
+
received notice of violation of this License (for any work) from that
|
| 426 |
+
copyright holder, and you cure the violation prior to 30 days after
|
| 427 |
+
your receipt of the notice.
|
| 428 |
+
|
| 429 |
+
Termination of your rights under this section does not terminate the
|
| 430 |
+
licenses of parties who have received copies or rights from you under
|
| 431 |
+
this License. If your rights have been terminated and not permanently
|
| 432 |
+
reinstated, you do not qualify to receive new licenses for the same
|
| 433 |
+
material under section 10.
|
| 434 |
+
|
| 435 |
+
9. Acceptance Not Required for Having Copies.
|
| 436 |
+
|
| 437 |
+
You are not required to accept this License in order to receive or
|
| 438 |
+
run a copy of the Program. Ancillary propagation of a covered work
|
| 439 |
+
occurring solely as a consequence of using peer-to-peer transmission
|
| 440 |
+
to receive a copy likewise does not require acceptance. However,
|
| 441 |
+
nothing other than this License grants you permission to propagate or
|
| 442 |
+
modify any covered work. These actions infringe copyright if you do
|
| 443 |
+
not accept this License. Therefore, by modifying or propagating a
|
| 444 |
+
covered work, you indicate your acceptance of this License to do so.
|
| 445 |
+
|
| 446 |
+
10. Automatic Licensing of Downstream Recipients.
|
| 447 |
+
|
| 448 |
+
Each time you convey a covered work, the recipient automatically
|
| 449 |
+
receives a license from the original licensors, to run, modify and
|
| 450 |
+
propagate that work, subject to this License. You are not responsible
|
| 451 |
+
for enforcing compliance by third parties with this License.
|
| 452 |
+
|
| 453 |
+
An "entity transaction" is a transaction transferring control of an
|
| 454 |
+
organization, or substantially all assets of one, or subdividing an
|
| 455 |
+
organization, or merging organizations. If propagation of a covered
|
| 456 |
+
work results from an entity transaction, each party to that
|
| 457 |
+
transaction who receives a copy of the work also receives whatever
|
| 458 |
+
licenses to the work the party's predecessor in interest had or could
|
| 459 |
+
give under the previous paragraph, plus a right to possession of the
|
| 460 |
+
Corresponding Source of the work from the predecessor in interest, if
|
| 461 |
+
the predecessor has it or can get it with reasonable efforts.
|
| 462 |
+
|
| 463 |
+
You may not impose any further restrictions on the exercise of the
|
| 464 |
+
rights granted or affirmed under this License. For example, you may
|
| 465 |
+
not impose a license fee, royalty, or other charge for exercise of
|
| 466 |
+
rights granted under this License, and you may not initiate litigation
|
| 467 |
+
(including a cross-claim or counterclaim in a lawsuit) alleging that
|
| 468 |
+
any patent claim is infringed by making, using, selling, offering for
|
| 469 |
+
sale, or importing the Program or any portion of it.
|
| 470 |
+
|
| 471 |
+
11. Patents.
|
| 472 |
+
|
| 473 |
+
A "contributor" is a copyright holder who authorizes use under this
|
| 474 |
+
License of the Program or a work on which the Program is based. The
|
| 475 |
+
work thus licensed is called the contributor's "contributor version".
|
| 476 |
+
|
| 477 |
+
A contributor's "essential patent claims" are all patent claims
|
| 478 |
+
owned or controlled by the contributor, whether already acquired or
|
| 479 |
+
hereafter acquired, that would be infringed by some manner, permitted
|
| 480 |
+
by this License, of making, using, or selling its contributor version,
|
| 481 |
+
but do not include claims that would be infringed only as a
|
| 482 |
+
consequence of further modification of the contributor version. For
|
| 483 |
+
purposes of this definition, "control" includes the right to grant
|
| 484 |
+
patent sublicenses in a manner consistent with the requirements of
|
| 485 |
+
this License.
|
| 486 |
+
|
| 487 |
+
Each contributor grants you a non-exclusive, worldwide, royalty-free
|
| 488 |
+
patent license under the contributor's essential patent claims, to
|
| 489 |
+
make, use, sell, offer for sale, import and otherwise run, modify and
|
| 490 |
+
propagate the contents of its contributor version.
|
| 491 |
+
|
| 492 |
+
In the following three paragraphs, a "patent license" is any express
|
| 493 |
+
agreement or commitment, however denominated, not to enforce a patent
|
| 494 |
+
(such as an express permission to practice a patent or covenant not to
|
| 495 |
+
sue for patent infringement). To "grant" such a patent license to a
|
| 496 |
+
party means to make such an agreement or commitment not to enforce a
|
| 497 |
+
patent against the party.
|
| 498 |
+
|
| 499 |
+
If you convey a covered work, knowingly relying on a patent license,
|
| 500 |
+
and the Corresponding Source of the work is not available for anyone
|
| 501 |
+
to copy, free of charge and under the terms of this License, through a
|
| 502 |
+
publicly available network server or other readily accessible means,
|
| 503 |
+
then you must either (1) cause the Corresponding Source to be so
|
| 504 |
+
available, or (2) arrange to deprive yourself of the benefit of the
|
| 505 |
+
patent license for this particular work, or (3) arrange, in a manner
|
| 506 |
+
consistent with the requirements of this License, to extend the patent
|
| 507 |
+
license to downstream recipients. "Knowingly relying" means you have
|
| 508 |
+
actual knowledge that, but for the patent license, your conveying the
|
| 509 |
+
covered work in a country, or your recipient's use of the covered work
|
| 510 |
+
in a country, would infringe one or more identifiable patents in that
|
| 511 |
+
country that you have reason to believe are valid.
|
| 512 |
+
|
| 513 |
+
If, pursuant to or in connection with a single transaction or
|
| 514 |
+
arrangement, you convey, or propagate by procuring conveyance of, a
|
| 515 |
+
covered work, and grant a patent license to some of the parties
|
| 516 |
+
receiving the covered work authorizing them to use, propagate, modify
|
| 517 |
+
or convey a specific copy of the covered work, then the patent license
|
| 518 |
+
you grant is automatically extended to all recipients of the covered
|
| 519 |
+
work and works based on it.
|
| 520 |
+
|
| 521 |
+
A patent license is "discriminatory" if it does not include within
|
| 522 |
+
the scope of its coverage, prohibits the exercise of, or is
|
| 523 |
+
conditioned on the non-exercise of one or more of the rights that are
|
| 524 |
+
specifically granted under this License. You may not convey a covered
|
| 525 |
+
work if you are a party to an arrangement with a third party that is
|
| 526 |
+
in the business of distributing software, under which you make payment
|
| 527 |
+
to the third party based on the extent of your activity of conveying
|
| 528 |
+
the work, and under which the third party grants, to any of the
|
| 529 |
+
parties who would receive the covered work from you, a discriminatory
|
| 530 |
+
patent license (a) in connection with copies of the covered work
|
| 531 |
+
conveyed by you (or copies made from those copies), or (b) primarily
|
| 532 |
+
for and in connection with specific products or compilations that
|
| 533 |
+
contain the covered work, unless you entered into that arrangement,
|
| 534 |
+
or that patent license was granted, prior to 28 March 2007.
|
| 535 |
+
|
| 536 |
+
Nothing in this License shall be construed as excluding or limiting
|
| 537 |
+
any implied license or other defenses to infringement that may
|
| 538 |
+
otherwise be available to you under applicable patent law.
|
| 539 |
+
|
| 540 |
+
12. No Surrender of Others' Freedom.
|
| 541 |
+
|
| 542 |
+
If conditions are imposed on you (whether by court order, agreement or
|
| 543 |
+
otherwise) that contradict the conditions of this License, they do not
|
| 544 |
+
excuse you from the conditions of this License. If you cannot convey a
|
| 545 |
+
covered work so as to satisfy simultaneously your obligations under this
|
| 546 |
+
License and any other pertinent obligations, then as a consequence you may
|
| 547 |
+
not convey it at all. For example, if you agree to terms that obligate you
|
| 548 |
+
to collect a royalty for further conveying from those to whom you convey
|
| 549 |
+
the Program, the only way you could satisfy both those terms and this
|
| 550 |
+
License would be to refrain entirely from conveying the Program.
|
| 551 |
+
|
| 552 |
+
13. Use with the GNU Affero General Public License.
|
| 553 |
+
|
| 554 |
+
Notwithstanding any other provision of this License, you have
|
| 555 |
+
permission to link or combine any covered work with a work licensed
|
| 556 |
+
under version 3 of the GNU Affero General Public License into a single
|
| 557 |
+
combined work, and to convey the resulting work. The terms of this
|
| 558 |
+
License will continue to apply to the part which is the covered work,
|
| 559 |
+
but the special requirements of the GNU Affero General Public License,
|
| 560 |
+
section 13, concerning interaction through a network will apply to the
|
| 561 |
+
combination as such.
|
| 562 |
+
|
| 563 |
+
14. Revised Versions of this License.
|
| 564 |
+
|
| 565 |
+
The Free Software Foundation may publish revised and/or new versions of
|
| 566 |
+
the GNU General Public License from time to time. Such new versions will
|
| 567 |
+
be similar in spirit to the present version, but may differ in detail to
|
| 568 |
+
address new problems or concerns.
|
| 569 |
+
|
| 570 |
+
Each version is given a distinguishing version number. If the
|
| 571 |
+
Program specifies that a certain numbered version of the GNU General
|
| 572 |
+
Public License "or any later version" applies to it, you have the
|
| 573 |
+
option of following the terms and conditions either of that numbered
|
| 574 |
+
version or of any later version published by the Free Software
|
| 575 |
+
Foundation. If the Program does not specify a version number of the
|
| 576 |
+
GNU General Public License, you may choose any version ever published
|
| 577 |
+
by the Free Software Foundation.
|
| 578 |
+
|
| 579 |
+
If the Program specifies that a proxy can decide which future
|
| 580 |
+
versions of the GNU General Public License can be used, that proxy's
|
| 581 |
+
public statement of acceptance of a version permanently authorizes you
|
| 582 |
+
to choose that version for the Program.
|
| 583 |
+
|
| 584 |
+
Later license versions may give you additional or different
|
| 585 |
+
permissions. However, no additional obligations are imposed on any
|
| 586 |
+
author or copyright holder as a result of your choosing to follow a
|
| 587 |
+
later version.
|
| 588 |
+
|
| 589 |
+
15. Disclaimer of Warranty.
|
| 590 |
+
|
| 591 |
+
THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
|
| 592 |
+
APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
|
| 593 |
+
HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
|
| 594 |
+
OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
|
| 595 |
+
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
| 596 |
+
PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
|
| 597 |
+
IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
|
| 598 |
+
ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
|
| 599 |
+
|
| 600 |
+
16. Limitation of Liability.
|
| 601 |
+
|
| 602 |
+
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
|
| 603 |
+
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
|
| 604 |
+
THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
|
| 605 |
+
GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
|
| 606 |
+
USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
|
| 607 |
+
DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
|
| 608 |
+
PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
|
| 609 |
+
EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
|
| 610 |
+
SUCH DAMAGES.
|
| 611 |
+
|
| 612 |
+
17. Interpretation of Sections 15 and 16.
|
| 613 |
+
|
| 614 |
+
If the disclaimer of warranty and limitation of liability provided
|
| 615 |
+
above cannot be given local legal effect according to their terms,
|
| 616 |
+
reviewing courts shall apply local law that most closely approximates
|
| 617 |
+
an absolute waiver of all civil liability in connection with the
|
| 618 |
+
Program, unless a warranty or assumption of liability accompanies a
|
| 619 |
+
copy of the Program in return for a fee.
|
| 620 |
+
|
| 621 |
+
END OF TERMS AND CONDITIONS
|
| 622 |
+
|
| 623 |
+
How to Apply These Terms to Your New Programs
|
| 624 |
+
|
| 625 |
+
If you develop a new program, and you want it to be of the greatest
|
| 626 |
+
possible use to the public, the best way to achieve this is to make it
|
| 627 |
+
free software which everyone can redistribute and change under these terms.
|
| 628 |
+
|
| 629 |
+
To do so, attach the following notices to the program. It is safest
|
| 630 |
+
to attach them to the start of each source file to most effectively
|
| 631 |
+
state the exclusion of warranty; and each file should have at least
|
| 632 |
+
the "copyright" line and a pointer to where the full notice is found.
|
| 633 |
+
|
| 634 |
+
<one line to give the program's name and a brief idea of what it does.>
|
| 635 |
+
Copyright (C) <year> <name of author>
|
| 636 |
+
|
| 637 |
+
This program is free software: you can redistribute it and/or modify
|
| 638 |
+
it under the terms of the GNU General Public License as published by
|
| 639 |
+
the Free Software Foundation, either version 3 of the License, or
|
| 640 |
+
(at your option) any later version.
|
| 641 |
+
|
| 642 |
+
This program is distributed in the hope that it will be useful,
|
| 643 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 644 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
| 645 |
+
GNU General Public License for more details.
|
| 646 |
+
|
| 647 |
+
You should have received a copy of the GNU General Public License
|
| 648 |
+
along with this program. If not, see <https://www.gnu.org/licenses/>.
|
| 649 |
+
|
| 650 |
+
Also add information on how to contact you by electronic and paper mail.
|
| 651 |
+
|
| 652 |
+
If the program does terminal interaction, make it output a short
|
| 653 |
+
notice like this when it starts in an interactive mode:
|
| 654 |
+
|
| 655 |
+
<program> Copyright (C) <year> <name of author>
|
| 656 |
+
This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
|
| 657 |
+
This is free software, and you are welcome to redistribute it
|
| 658 |
+
under certain conditions; type `show c' for details.
|
| 659 |
+
|
| 660 |
+
The hypothetical commands `show w' and `show c' should show the appropriate
|
| 661 |
+
parts of the General Public License. Of course, your program's commands
|
| 662 |
+
might be different; for a GUI interface, you would use an "about box".
|
| 663 |
+
|
| 664 |
+
You should also get your employer (if you work as a programmer) or school,
|
| 665 |
+
if any, to sign a "copyright disclaimer" for the program, if necessary.
|
| 666 |
+
For more information on this, and how to apply and follow the GNU GPL, see
|
| 667 |
+
<https://www.gnu.org/licenses/>.
|
| 668 |
+
|
| 669 |
+
The GNU General Public License does not permit incorporating your program
|
| 670 |
+
into proprietary programs. If your program is a subroutine library, you
|
| 671 |
+
may consider it more useful to permit linking proprietary applications with
|
| 672 |
+
the library. If this is what you want to do, use the GNU Lesser General
|
| 673 |
+
Public License instead of this License. But first, please read
|
| 674 |
+
<https://www.gnu.org/licenses/why-not-lgpl.html>.
|
models/icpr2020dfdc/README.md
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Video Face Manipulation Detection Through Ensemble of CNNs
|
| 2 |
+
[Papers with Code: Deepfake Detection on DFDC](https://paperswithcode.com/sota/deepfake-detection-on-dfdc?p=video-face-manipulation-detection-through)
|
| 3 |
+
[Papers with Code: Deepfake Detection on FaceForensics++](https://paperswithcode.com/sota/deepfake-detection-on-faceforensics-1?p=video-face-manipulation-detection-through)
|
| 4 |
+
[Build Status](https://travis-ci.org/polimi-ispl/icpr2020dfdc)
|
| 5 |
+
|
| 6 |
+

|
| 7 |
+
|
| 8 |
+
<p align='center'>
|
| 9 |
+
<img src='assets/mqzvfufzoq_face.gif'/>
|
| 10 |
+
<img src='assets/mqzvfufzoq_face_att.gif'/>
|
| 11 |
+
</p>
|
| 12 |
+
|
| 13 |
+
This is the official repository of **Video Face Manipulation Detection Through Ensemble of CNNs**,
|
| 14 |
+
presented at [ICPR2020](https://www.micc.unifi.it/icpr2020/) and currently available on [IEEExplore](https://ieeexplore.ieee.org/document/9412711) and [arXiv](https://arxiv.org/abs/2004.07676).
|
| 15 |
+
If you use this repository for your research, please consider citing our paper. Refer to [How to cite](https://github.com/polimi-ispl/icpr2020dfdc#how-to-cite) section to get the correct entry for your bibliography.
|
| 16 |
+
|
| 17 |
+
We participated as the **ISPL** team in the [Kaggle Deepfake Detection Challenge](https://www.kaggle.com/c/deepfake-detection-challenge/).
|
| 18 |
+
With this implementation, we reached the 41st position out of 2116 teams (**top 2%**) on the [private leaderboard](https://www.kaggle.com/c/deepfake-detection-challenge/leaderboard).
|
| 19 |
+
|
| 20 |
+
This repository is currently under maintenance. If you experience any problems, please open an [issue](https://github.com/polimi-ispl/icpr2020dfdc/issues).
|
| 21 |
+
## Getting started
|
| 22 |
+
|
| 23 |
+
### Prerequisites
|
| 24 |
+
- Install [conda](https://docs.conda.io/en/latest/miniconda.html)
|
| 25 |
+
- Create the `icpr2020` environment with *environment.yml*
|
| 26 |
+
```bash
|
| 27 |
+
$ conda env create -f environment.yml
|
| 28 |
+
$ conda activate icpr2020
|
| 29 |
+
```
|
| 30 |
+
- Download and unzip the [datasets](#datasets)
|
| 31 |
+
|
| 32 |
+
### Quick run
|
| 33 |
+
If you just want to test the pre-trained models against your own videos or images:
|
| 34 |
+
- [Video prediction notebook](https://github.com/polimi-ispl/icpr2020dfdc/blob/master/notebook/Video%20prediction.ipynb) <a target="_blank" href="https://colab.research.google.com/drive/12WnvmerHBNbJ49HdoH1lli_O8SwaFPjv?usp=sharing">
|
| 35 |
+
<img src="https://colab.research.google.com/assets/colab-badge.svg">
|
| 36 |
+
</a>
|
| 37 |
+
|
| 38 |
+
- [Image prediction notebook](https://github.com/polimi-ispl/icpr2020dfdc/blob/master/notebook/Image%20prediction.ipynb) <a target="_blank" href="https://colab.research.google.com/drive/19oVKlzEr58VZfRnSq-nW8kFYuxkh3GM8?usp=sharing">
|
| 39 |
+
<img src="https://colab.research.google.com/assets/colab-badge.svg">
|
| 40 |
+
</a>
|
| 41 |
+
|
| 42 |
+
- [Image prediction with attention](notebook/Image%20prediction%20and%20attention.ipynb) <a target="_blank" href="https://colab.research.google.com/drive/1zcglis2Qx2vtJhrogn8aKA-mbUotLZLK?usp=sharing">
|
| 43 |
+
<img src="https://colab.research.google.com/assets/colab-badge.svg">
|
| 44 |
+
</a>
|
| 45 |
+
|
| 46 |
+
### The whole pipeline
|
| 47 |
+
You need to preprocess the datasets in order to index all the samples and extract faces. Just run the script [make_dataset.sh](scripts/make_dataset.sh)
|
| 48 |
+
|
| 49 |
+
```bash
|
| 50 |
+
$ ./scripts/make_dataset.sh
|
| 51 |
+
```
|
| 52 |
+
|
| 53 |
+
Please note that we use only 32 frames per video. You can easily tweak this parameter in [extract_faces.py](extract_faces.py)
|
| 54 |
+
Also, please note that **for the DFDC** we have resorted to _the training split_ exclusively!
|
| 55 |
+
In `scripts/make_dataset.sh` the value of `DFDC_SRC` should point to the directory containing the DFDC train split.
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
### Celeb-DF (v2)
|
| 59 |
+
Although **we did not use this dataset in the paper**, we provide a script [index_celebdf.py](index_celebdf.py) to index the videos similarly to
|
| 60 |
+
DFDC and FF++. Once you have the index, you can proceed with the pipeline starting from [extract_faces.py](extract_faces.py). You can also use the
|
| 61 |
+
split `celebdf` during training/testing.
|
| 62 |
+
|
| 63 |
+
### Train
|
| 64 |
+
In [train_all.sh](scripts/train_all.sh) you can find a comprehensive list of all the commands to train the models presented in the paper.
|
| 65 |
+
Please refer to the comments in the script for hints on their usage.
|
| 66 |
+
|
| 67 |
+
#### Training a single model
|
| 68 |
+
If you want to train some models without launching the script:
|
| 69 |
+
- for the **non-siamese** architectures (e.g. EfficientNetB4, EfficientNetB4Att), you can simply specify the model in [train_binclass.py](train_binclass.py) with the *--net* parameter;
|
| 70 |
+
- for the **siamese** architectures (e.g. EfficientNetB4ST, EfficientNetB4AttST), you have to:
|
| 71 |
+
1. train the architecture as a feature extractor first, using the [train_triplet.py](train_triplet.py) script and taking care to specify its name with the *--net* parameter **without** the ST suffix. For instance, to train the EfficientNetB4ST you first have to run `python train_triplet.py --net EfficientNetB4 --otherparams`;
|
| 72 |
+
2. finetune the model using [train_binclass.py](train_binclass.py), being careful this time to specify the architecture's name **with** the ST suffix and to insert as *--init* argument the path to the weights of the feature extractor trained at the previous step. You will end up running something like `python train_binclass.py --net EfficientNetB4ST --init path/to/EfficientNetB4/weights/trained/with/train_triplet/weights.pth --otherparams`
|
| 73 |
+
|
| 74 |
+
### Test
|
| 75 |
+
In [test_all.sh](scripts/test_all.sh) you can find a comprehensive list of all the commands for testing the models presented in the paper.
|
| 76 |
+
|
| 77 |
+
#### Pretrained weights
|
| 78 |
+
We also provide pretrained weights for all the architectures presented in the paper.
|
| 79 |
+
Please refer to this [Dropbox link](https://www.dropbox.com/sh/cesamx5ytd5j08c/AADG_eEmhskliMaT0Gbk-yHDa?dl=0).
|
| 80 |
+
Each directory is named `$NETWORK_$DATASET` where `$NETWORK` is the architecture name and `$DATASET` is the training dataset.
|
| 81 |
+
In each directory, you can find `bestval.pth`, which contains the best network weights according to the validation set.
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
Additionally, you can find Jupyter notebooks for computing the results in the [notebook](notebook) folder.
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
## Datasets
|
| 88 |
+
- [Facebook's DeepFake Detection Challenge (DFDC) train dataset](https://www.kaggle.com/c/deepfake-detection-challenge/data) | [arXiv paper](https://arxiv.org/abs/2006.07397)
|
| 89 |
+
- [FaceForensics++](https://github.com/ondyari/FaceForensics/blob/master/dataset/README.md) | [arXiv paper](https://arxiv.org/abs/1901.08971)
|
| 90 |
+
- [Celeb-DF (v2)](http://www.cs.albany.edu/~lsw/celeb-deepfakeforensics.html) | [arXiv paper](https://arxiv.org/abs/1909.12962) (**Just for reference, not used in the paper**)
|
| 91 |
+
|
| 92 |
+
## References
|
| 93 |
+
- [EfficientNet PyTorch](https://github.com/lukemelas/EfficientNet-PyTorch)
|
| 94 |
+
- [Xception PyTorch](https://github.com/tstandley/Xception-PyTorch)
|
| 95 |
+
|
| 96 |
+
## How to cite
|
| 97 |
+
Plain text:
|
| 98 |
+
```
|
| 99 |
+
N. Bonettini, E. D. Cannas, S. Mandelli, L. Bondi, P. Bestagini and S. Tubaro, "Video Face Manipulation Detection Through Ensemble of CNNs," 2020 25th International Conference on Pattern Recognition (ICPR), 2021, pp. 5012-5019, doi: 10.1109/ICPR48806.2021.9412711.
|
| 100 |
+
```
|
| 101 |
+
|
| 102 |
+
Bibtex:
|
| 103 |
+
```bibtex
|
| 104 |
+
@INPROCEEDINGS{9412711,
|
| 105 |
+
author={Bonettini, Nicolò and Cannas, Edoardo Daniele and Mandelli, Sara and Bondi, Luca and Bestagini, Paolo and Tubaro, Stefano},
|
| 106 |
+
booktitle={2020 25th International Conference on Pattern Recognition (ICPR)},
|
| 107 |
+
title={Video Face Manipulation Detection Through Ensemble of CNNs},
|
| 108 |
+
year={2021},
|
| 109 |
+
volume={},
|
| 110 |
+
number={},
|
| 111 |
+
pages={5012-5019},
|
| 112 |
+
doi={10.1109/ICPR48806.2021.9412711}}
|
| 113 |
+
```
|
| 114 |
+
## Credits
|
| 115 |
+
[Image and Sound Processing Lab - Politecnico di Milano](http://ispl.deib.polimi.it/)
|
| 116 |
+
- Nicolò Bonettini
|
| 117 |
+
- Edoardo Daniele Cannas
|
| 118 |
+
- Sara Mandelli
|
| 119 |
+
- Luca Bondi
|
| 120 |
+
- Paolo Bestagini
|
models/icpr2020dfdc/architectures/__init__.py
ADDED
|
File without changes
|
models/icpr2020dfdc/architectures/externals/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
from .xception import xception
|
models/icpr2020dfdc/architectures/externals/xception.py
ADDED
|
@@ -0,0 +1,236 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Ported to pytorch thanks to [tstandley](https://github.com/tstandley/Xception-PyTorch)
|
| 3 |
+
|
| 4 |
+
@author: tstandley
|
| 5 |
+
Adapted by cadene
|
| 6 |
+
|
| 7 |
+
Creates an Xception Model as defined in:
|
| 8 |
+
|
| 9 |
+
Francois Chollet
|
| 10 |
+
Xception: Deep Learning with Depthwise Separable Convolutions
|
| 11 |
+
https://arxiv.org/pdf/1610.02357.pdf
|
| 12 |
+
|
| 13 |
+
These weights were ported from the Keras implementation. They achieve the following performance on the validation set:
|
| 14 |
+
|
| 15 |
+
Loss:0.9173 Prec@1:78.892 Prec@5:94.292
|
| 16 |
+
|
| 17 |
+
REMEMBER to set your image size to 3x299x299 for both test and validation
|
| 18 |
+
|
| 19 |
+
normalize = transforms.Normalize(mean=[0.5, 0.5, 0.5],
|
| 20 |
+
std=[0.5, 0.5, 0.5])
|
| 21 |
+
|
| 22 |
+
The resize parameter of the validation transform should be 333, and make sure to center crop at 299x299
|
| 23 |
+
"""
|
| 24 |
+
from __future__ import print_function, division, absolute_import
|
| 25 |
+
|
| 26 |
+
import torch.nn as nn
|
| 27 |
+
import torch.nn.functional as F
|
| 28 |
+
import torch.utils.model_zoo as model_zoo
|
| 29 |
+
|
| 30 |
+
__all__ = ['xception']
|
| 31 |
+
|
| 32 |
+
pretrained_settings = {
|
| 33 |
+
'xception': {
|
| 34 |
+
'imagenet': {
|
| 35 |
+
'url': 'http://data.lip6.fr/cadene/pretrainedmodels/xception-43020ad28.pth',
|
| 36 |
+
'input_space': 'RGB',
|
| 37 |
+
'input_size': [3, 299, 299],
|
| 38 |
+
'input_range': [0, 1],
|
| 39 |
+
'mean': [0.5, 0.5, 0.5],
|
| 40 |
+
'std': [0.5, 0.5, 0.5],
|
| 41 |
+
'num_classes': 1000,
|
| 42 |
+
'scale': 0.8975
|
| 43 |
+
# The resize parameter of the validation transform should be 333, and make sure to center crop at 299x299
|
| 44 |
+
}
|
| 45 |
+
}
|
| 46 |
+
}
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
class SeparableConv2d(nn.Module):
    """Depthwise-separable 2D convolution.

    A grouped convolution with one group per input channel is followed by a
    1x1 convolution that maps ``in_channels`` to ``out_channels``.

    Args:
        in_channels: number of channels of the input tensor.
        out_channels: number of channels produced by the 1x1 convolution.
        kernel_size, stride, padding, dilation: forwarded to the grouped
            (first) convolution only.
        bias: whether both convolutions carry a bias term.
    """

    def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, padding=0, dilation=1, bias=False):
        super(SeparableConv2d, self).__init__()

        # groups == in_channels: every input channel gets its own filter.
        self.conv1 = nn.Conv2d(
            in_channels=in_channels,
            out_channels=in_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=in_channels,
            bias=bias,
        )
        # 1x1 convolution mixing the channels produced above.
        self.pointwise = nn.Conv2d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=1,
            stride=1,
            padding=0,
            dilation=1,
            groups=1,
            bias=bias,
        )

    def forward(self, x):
        """Apply the grouped convolution, then the 1x1 convolution."""
        return self.pointwise(self.conv1(x))
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
class Block(nn.Module):
    """Residual block built from ReLU / SeparableConv2d / BatchNorm2d triples.

    Args:
        in_filters: channels of the block input.
        out_filters: channels of the block output.
        reps: number of separable-conv triples in the main path.
        strides: when different from 1, a max-pool with this stride ends the
            main path and the skip path uses a strided 1x1 convolution.
        start_with_relu: if False, the leading ReLU of the main path is
            dropped; if True, it is replaced by a non-inplace ReLU so the
            block input is left untouched for the skip path.
        grow_first: whether the channel change happens in the first triple
            (True) or in the last one (False).
    """

    def __init__(self, in_filters, out_filters, reps, strides=1, start_with_relu=True, grow_first=True):
        super(Block, self).__init__()

        # A projection on the skip path is only needed when the shape changes.
        needs_projection = out_filters != in_filters or strides != 1
        if needs_projection:
            self.skip = nn.Conv2d(in_filters, out_filters, 1, stride=strides, bias=False)
            self.skipbn = nn.BatchNorm2d(out_filters)
        else:
            self.skip = None

        def triple(c_in, c_out):
            # One ReLU -> 3x3 separable conv -> batch-norm unit.
            return [
                nn.ReLU(inplace=True),
                SeparableConv2d(c_in, c_out, 3, stride=1, padding=1, bias=False),
                nn.BatchNorm2d(c_out),
            ]

        layers = []
        width = in_filters
        if grow_first:
            layers.extend(triple(in_filters, out_filters))
            width = out_filters

        for _ in range(reps - 1):
            layers.extend(triple(width, width))

        if not grow_first:
            layers.extend(triple(in_filters, out_filters))

        if start_with_relu:
            # The first ReLU must not modify the input in place.
            layers[0] = nn.ReLU(inplace=False)
        else:
            layers = layers[1:]

        if strides != 1:
            layers.append(nn.MaxPool2d(3, strides, 1))
        self.rep = nn.Sequential(*layers)

    def forward(self, inp):
        """Return main path output plus the (optionally projected) input."""
        x = self.rep(inp)

        shortcut = inp if self.skip is None else self.skipbn(self.skip(inp))

        x += shortcut
        return x
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
class Xception(nn.Module):
    """
    Xception optimized for the ImageNet dataset, as specified in
    https://arxiv.org/pdf/1610.02357.pdf

    The classifier is created as ``self.fc``. The module-level ``xception()``
    factory renames it to ``last_linear``; ``logits()`` accepts either name so
    the class also works when instantiated directly.
    """

    def __init__(self, num_classes=1000):
        """ Constructor
        Args:
            num_classes: number of classes
        """
        super(Xception, self).__init__()
        self.num_classes = num_classes

        self.conv1 = nn.Conv2d(3, 32, 3, 2, 0, bias=False)
        self.bn1 = nn.BatchNorm2d(32)
        self.relu1 = nn.ReLU(inplace=True)

        self.conv2 = nn.Conv2d(32, 64, 3, bias=False)
        self.bn2 = nn.BatchNorm2d(64)
        self.relu2 = nn.ReLU(inplace=True)

        # block1 follows relu2 directly, hence start_with_relu=False.
        self.block1 = Block(64, 128, 2, 2, start_with_relu=False, grow_first=True)
        self.block2 = Block(128, 256, 2, 2, start_with_relu=True, grow_first=True)
        self.block3 = Block(256, 728, 2, 2, start_with_relu=True, grow_first=True)

        # block4 .. block11 are eight identical 728-channel blocks; attribute
        # names are kept so existing state dicts still load.
        for idx in range(4, 12):
            setattr(self, 'block{}'.format(idx),
                    Block(728, 728, 3, 1, start_with_relu=True, grow_first=True))

        self.block12 = Block(728, 1024, 2, 2, start_with_relu=True, grow_first=False)

        self.conv3 = SeparableConv2d(1024, 1536, 3, 1, 1)
        self.bn3 = nn.BatchNorm2d(1536)
        self.relu3 = nn.ReLU(inplace=True)

        # The ReLU after bn4 is applied in logits().
        self.conv4 = SeparableConv2d(1536, 2048, 3, 1, 1)
        self.bn4 = nn.BatchNorm2d(2048)

        self.fc = nn.Linear(2048, num_classes)

    def features(self, input):
        """Run the convolutional trunk and return the output of ``bn4``."""
        x = self.conv1(input)
        x = self.bn1(x)
        x = self.relu1(x)

        x = self.conv2(x)
        x = self.bn2(x)
        x = self.relu2(x)

        for idx in range(1, 13):
            x = getattr(self, 'block{}'.format(idx))(x)

        x = self.conv3(x)
        x = self.bn3(x)
        x = self.relu3(x)

        x = self.conv4(x)
        x = self.bn4(x)
        return x

    def logits(self, features):
        """ReLU (in place), global-average-pool and classify ``features``."""
        x = F.relu(features, inplace=True)

        x = F.adaptive_avg_pool2d(x, (1, 1))
        x = x.view(x.size(0), -1)
        # `last_linear` only exists once the factory has renamed `fc`; fall
        # back to `fc` so a directly constructed model does not raise
        # AttributeError here.
        classifier = self.last_linear if hasattr(self, 'last_linear') else self.fc
        return classifier(x)

    def forward(self, input):
        """Return class scores for a batch of images."""
        x = self.features(input)
        x = self.logits(x)
        return x
|
| 215 |
+
|
| 216 |
+
|
| 217 |
+
def xception(num_classes=1000, pretrained='imagenet'):
    """Build an Xception model, optionally loading pretrained weights.

    Args:
        num_classes: size of the final linear layer. When ``pretrained`` is
            set it must equal the ``num_classes`` of the selected settings.
        pretrained: key into ``pretrained_settings['xception']``; any falsy
            value skips weight loading.

    Returns:
        The model, with its classifier exposed as ``last_linear`` (the
        original ``fc`` attribute is removed).
    """
    # Build the network once; the pretrained branch only loads weights into it.
    model = Xception(num_classes=num_classes)
    if pretrained:
        settings = pretrained_settings['xception'][pretrained]
        assert num_classes == settings['num_classes'], \
            "num_classes should be {}, but is {}".format(settings['num_classes'], num_classes)

        # NOTE(review): the checkpoint is downloaded from settings['url'],
        # which is a plain-HTTP URL in pretrained_settings, and is then
        # deserialized by torch - confirm the source is trusted.
        model.load_state_dict(model_zoo.load_url(settings['url']))

        model.input_space = settings['input_space']
        model.input_size = settings['input_size']
        model.input_range = settings['input_range']
        model.mean = settings['mean']
        model.std = settings['std']

    # TODO: ugly
    # The state dict above uses the `fc` key, so the rename has to happen
    # after loading.
    model.last_linear = model.fc
    del model.fc
    return model
|
models/icpr2020dfdc/architectures/fornet.py
ADDED
|
@@ -0,0 +1,245 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Video Face Manipulation Detection Through Ensemble of CNNs
|
| 3 |
+
|
| 4 |
+
Image and Sound Processing Lab - Politecnico di Milano
|
| 5 |
+
|
| 6 |
+
Nicolò Bonettini
|
| 7 |
+
Edoardo Daniele Cannas
|
| 8 |
+
Sara Mandelli
|
| 9 |
+
Luca Bondi
|
| 10 |
+
Paolo Bestagini
|
| 11 |
+
"""
|
| 12 |
+
from collections import OrderedDict
|
| 13 |
+
|
| 14 |
+
import torch
|
| 15 |
+
from efficientnet_pytorch import EfficientNet
|
| 16 |
+
from torch import nn as nn
|
| 17 |
+
from torch.nn import functional as F
|
| 18 |
+
from torchvision import transforms
|
| 19 |
+
|
| 20 |
+
from . import externals
|
| 21 |
+
|
| 22 |
+
"""
|
| 23 |
+
Feature Extractor
|
| 24 |
+
"""
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
class FeatureExtractor(nn.Module):
    """
    Abstract base class to be extended by networks that support feature extraction.
    It also provides a standard input normalizer and the default set of
    trainable parameters.
    """

    def features(self, x: torch.Tensor) -> torch.Tensor:
        """Return the feature tensor for ``x``; subclasses must override this."""
        raise NotImplementedError()

    def get_trainable_parameters(self):
        """Return the parameters to optimize (all module parameters by default)."""
        return self.parameters()

    @staticmethod
    def get_normalizer():
        """Return the per-channel normalization transform used for inputs."""
        # Presumably the usual ImageNet channel statistics - confirm if changed.
        channel_mean = [0.485, 0.456, 0.406]
        channel_std = [0.229, 0.224, 0.225]
        return transforms.Normalize(mean=channel_mean, std=channel_std)
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
"""
|
| 45 |
+
EfficientNet
|
| 46 |
+
"""
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
class EfficientNetGen(FeatureExtractor):
    """EfficientNet backbone with a single-output linear classifier.

    :param model: EfficientNet variant name passed to ``EfficientNet.from_pretrained``
    """

    def __init__(self, model: str):
        super().__init__()

        backbone = EfficientNet.from_pretrained(model)
        self.efficientnet = backbone
        # One score per sample, fed by the channels of the backbone head conv.
        self.classifier = nn.Linear(backbone._conv_head.out_channels, 1)
        # The backbone's own fully connected layer is replaced by `classifier`.
        del self.efficientnet._fc

    def features(self, x: torch.Tensor) -> torch.Tensor:
        """Return pooled backbone features flattened to one vector per sample."""
        feature_map = self.efficientnet.extract_features(x)
        pooled = self.efficientnet._avg_pooling(feature_map)
        return pooled.flatten(start_dim=1)

    def forward(self, x):
        """Return the classifier score computed from the dropped-out features."""
        embedding = self.features(x)
        return self.classifier(self.efficientnet._dropout(embedding))
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
class EfficientNetB4(EfficientNetGen):
    """``EfficientNetGen`` fixed to the 'efficientnet-b4' variant."""

    def __init__(self):
        super().__init__(model='efficientnet-b4')
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
"""
|
| 76 |
+
EfficientNetAutoAtt
|
| 77 |
+
"""
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
class EfficientNetAutoAtt(EfficientNet):
    """EfficientNet whose feature map is re-weighted by a learned attention map.

    ``init_att`` must be called after construction: it creates ``attconv``
    and ``att_block_idx``, which ``get_attention`` and ``extract_features``
    both read.
    """

    def init_att(self, model: str, width: int):
        """
        Initialize attention
        :param model: efficientnet-bx, x in {0,..,7}; only 'efficientnet-b4' is accepted here
        :param width: attention width, i.e. the number of 3x3 conv + ReLU stages
            placed before the final 1x1 conv (0 gives a single 1x1 conv)
        :return:
        :raises ValueError: if ``model`` is not 'efficientnet-b4'
        """
        if model != 'efficientnet-b4':
            raise ValueError('Model not valid: {}'.format(model))

        # Index of the block after which the attention map is computed.
        self.att_block_idx = 9
        if width == 0:
            self.attconv = nn.Conv2d(kernel_size=1, in_channels=56, out_channels=1)
        else:
            attconv_layers = []
            for i in range(width):
                attconv_layers.append(
                    ('conv{:d}'.format(i), nn.Conv2d(kernel_size=3, padding=1, in_channels=56, out_channels=56)))
                attconv_layers.append(
                    ('relu{:d}'.format(i), nn.ReLU(inplace=True)))
            attconv_layers.append(('conv_out', nn.Conv2d(kernel_size=1, in_channels=56, out_channels=1)))
            self.attconv = nn.Sequential(OrderedDict(attconv_layers))

    def get_attention(self, x: torch.Tensor) -> torch.Tensor:
        """Return the sigmoid attention map computed after block ``att_block_idx``.

        Only the stem and the blocks up to that index are evaluated. ``None``
        is returned if no block has that index.
        """
        # Placeholder
        att = None

        # Stem
        x = self._swish(self._bn0(self._conv_stem(x)))

        # Blocks
        for idx, block in enumerate(self._blocks):
            drop_connect_rate = self._global_params.drop_connect_rate
            if drop_connect_rate:
                # Scale the drop rate by the block's position in the network.
                drop_connect_rate *= float(idx) / len(self._blocks)
            x = block(x, drop_connect_rate=drop_connect_rate)
            if idx == self.att_block_idx:
                att = torch.sigmoid(self.attconv(x))
                break

        return att

    def extract_features(self, x: torch.Tensor) -> torch.Tensor:
        """Return the head feature map, with attention applied after block ``att_block_idx``."""
        # Stem
        x = self._swish(self._bn0(self._conv_stem(x)))

        # Blocks
        for idx, block in enumerate(self._blocks):
            drop_connect_rate = self._global_params.drop_connect_rate
            if drop_connect_rate:
                # Scale the drop rate by the block's position in the network.
                drop_connect_rate *= float(idx) / len(self._blocks)
            x = block(x, drop_connect_rate=drop_connect_rate)
            if idx == self.att_block_idx:
                # Single-channel map, broadcast over the feature channels.
                att = torch.sigmoid(self.attconv(x))
                x = x * att

        # Head
        x = self._swish(self._bn1(self._conv_head(x)))

        return x
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
class EfficientNetGenAutoAtt(FeatureExtractor):
    """Attention-augmented EfficientNet with a single-output linear classifier.

    :param model: EfficientNet variant name passed to ``from_pretrained`` and ``init_att``
    :param width: attention width forwarded to ``init_att``
    """

    def __init__(self, model: str, width: int):
        super().__init__()

        backbone = EfficientNetAutoAtt.from_pretrained(model)
        backbone.init_att(model, width)
        self.efficientnet = backbone
        self.classifier = nn.Linear(backbone._conv_head.out_channels, 1)
        # The backbone's own fully connected layer is replaced by `classifier`.
        del self.efficientnet._fc

    def features(self, x: torch.Tensor) -> torch.Tensor:
        """Return pooled backbone features flattened to one vector per sample."""
        feature_map = self.efficientnet.extract_features(x)
        pooled = self.efficientnet._avg_pooling(feature_map)
        return pooled.flatten(start_dim=1)

    def forward(self, x):
        """Return the classifier score computed from the dropped-out features."""
        embedding = self.features(x)
        return self.classifier(self.efficientnet._dropout(embedding))

    def get_attention(self, x: torch.Tensor) -> torch.Tensor:
        """Delegate to the backbone's ``get_attention``."""
        return self.efficientnet.get_attention(x)
|
| 167 |
+
|
| 168 |
+
|
| 169 |
+
class EfficientNetAutoAttB4(EfficientNetGenAutoAtt):
    """``EfficientNetGenAutoAtt`` fixed to 'efficientnet-b4' with attention width 0."""

    def __init__(self):
        super().__init__(model='efficientnet-b4', width=0)
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
"""
|
| 175 |
+
Xception
|
| 176 |
+
"""
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
class Xception(FeatureExtractor):
    """Xception backbone from ``externals`` with a single-output classifier."""

    def __init__(self):
        super().__init__()
        backbone = externals.xception()
        # Swap the backbone classifier for a one-output linear layer.
        backbone.last_linear = nn.Linear(2048, 1)
        self.xception = backbone

    def features(self, x: torch.Tensor) -> torch.Tensor:
        """Return rectified, globally pooled backbone features, one vector per sample."""
        feature_map = self.xception.features(x)
        feature_map = nn.ReLU(inplace=True)(feature_map)
        pooled = F.adaptive_avg_pool2d(feature_map, (1, 1))
        return pooled.view(pooled.size(0), -1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the backbone's forward output for ``x``."""
        return self.xception.forward(x)
|
| 194 |
+
|
| 195 |
+
|
| 196 |
+
"""
|
| 197 |
+
Siamese tuning
|
| 198 |
+
"""
|
| 199 |
+
|
| 200 |
+
|
| 201 |
+
class SiameseTuning(FeatureExtractor):
    """Batch-norm + linear classifier on top of a wrapped feature extractor.

    :param feat_ext: callable invoked with no arguments to build the feature
        extractor (the subclasses in this module pass a class)
    :param num_feat: length of the feature vectors produced by ``features``
    :param lastonly: if True, features are computed without gradients and only
        the classifier parameters are reported as trainable
    """

    def __init__(self, feat_ext: FeatureExtractor, num_feat: int, lastonly: bool = True):
        super().__init__()
        self.feat_ext = feat_ext()
        if not hasattr(self.feat_ext, 'features'):
            raise NotImplementedError('The provided feature extractor needs to provide a features() method')
        self.lastonly = lastonly
        head_layers = [
            nn.BatchNorm1d(num_features=num_feat),
            nn.Linear(in_features=num_feat, out_features=1),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def features(self, x):
        """Return the wrapped extractor's features for ``x``."""
        return self.feat_ext.features(x)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the classifier score for ``x``."""
        if not self.lastonly:
            feats = self.features(x)
        else:
            # Features are computed without gradient tracking in this mode.
            with torch.no_grad():
                feats = self.features(x)
        return self.classifier(feats)

    def get_trainable_parameters(self):
        """Return the classifier parameters if ``lastonly``, else all parameters."""
        owner = self.classifier if self.lastonly else self
        return owner.parameters()
|
| 231 |
+
|
| 232 |
+
|
| 233 |
+
class EfficientNetB4ST(SiameseTuning):
    """``SiameseTuning`` over ``EfficientNetB4`` (1792 features, classifier-only training)."""

    def __init__(self):
        super().__init__(feat_ext=EfficientNetB4, num_feat=1792, lastonly=True)
|
| 236 |
+
|
| 237 |
+
|
| 238 |
+
class EfficientNetAutoAttB4ST(SiameseTuning):
    """``SiameseTuning`` over ``EfficientNetAutoAttB4`` (1792 features, classifier-only training)."""

    def __init__(self):
        super().__init__(feat_ext=EfficientNetAutoAttB4, num_feat=1792, lastonly=True)
|
| 241 |
+
|
| 242 |
+
|
| 243 |
+
class XceptionST(SiameseTuning):
    """``SiameseTuning`` over ``Xception`` (2048 features, classifier-only training)."""

    def __init__(self):
        super().__init__(feat_ext=Xception, num_feat=2048, lastonly=True)
|
models/icpr2020dfdc/architectures/tripletnet.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Video Face Manipulation Detection Through Ensemble of CNNs
|
| 3 |
+
|
| 4 |
+
Image and Sound Processing Lab - Politecnico di Milano
|
| 5 |
+
|
| 6 |
+
Nicolò Bonettini
|
| 7 |
+
Edoardo Daniele Cannas
|
| 8 |
+
Sara Mandelli
|
| 9 |
+
Luca Bondi
|
| 10 |
+
Paolo Bestagini
|
| 11 |
+
"""
|
| 12 |
+
from . import fornet
|
| 13 |
+
from .fornet import FeatureExtractor
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class TripletNet(FeatureExtractor):
    """
    Template class for triplet net: one wrapped feature extractor is applied
    to each of the three inputs.
    """

    def __init__(self, feat_ext: FeatureExtractor):
        super().__init__()
        # `feat_ext` is invoked with no arguments to build the extractor.
        self.feat_ext = feat_ext()
        if not hasattr(self.feat_ext, 'features'):
            raise NotImplementedError('The provided feature extractor needs to provide a features() method')

    def features(self, x):
        """Return the wrapped extractor's features for ``x``."""
        return self.feat_ext.features(x)

    def forward(self, x1, x2, x3):
        """Return the features of the three inputs, in the order given."""
        f1 = self.features(x1)
        f2 = self.features(x2)
        f3 = self.features(x3)
        return f1, f2, f3
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
class EfficientNetB4(TripletNet):
    """``TripletNet`` built on ``fornet.EfficientNetB4``."""

    def __init__(self):
        super().__init__(feat_ext=fornet.EfficientNetB4)
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
class EfficientNetAutoAttB4(TripletNet):
    """``TripletNet`` built on ``fornet.EfficientNetAutoAttB4``."""

    def __init__(self):
        super().__init__(feat_ext=fornet.EfficientNetAutoAttB4)
|
models/icpr2020dfdc/architectures/weights.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Video Face Manipulation Detection Through Ensemble of CNNs
|
| 3 |
+
|
| 4 |
+
Image and Sound Processing Lab - Politecnico di Milano
|
| 5 |
+
|
| 6 |
+
Nicolò Bonettini
|
| 7 |
+
Edoardo Daniele Cannas
|
| 8 |
+
Sara Mandelli
|
| 9 |
+
Luca Bondi
|
| 10 |
+
Paolo Bestagini
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
weight_url = {
|
| 14 |
+
'EfficientNetAutoAttB4ST_DFDC':'https://f002.backblazeb2.com/file/icpr2020/EfficientNetAutoAttB4ST_DFDC_bestval-4df0ef7d2f380a5955affa78c35d0942ac1cd65229510353b252737775515a33.pth',
|
| 15 |
+
'EfficientNetAutoAttB4ST_FFPP':'https://f002.backblazeb2.com/file/icpr2020/EfficientNetAutoAttB4ST_FFPP_bestval-ddb357503b9b902e1b925c2550415604c4252b9b9ecafeb7369dc58cc16e9edd.pth',
|
| 16 |
+
'EfficientNetAutoAttB4_DFDC':'https://f002.backblazeb2.com/file/icpr2020/EfficientNetAutoAttB4_DFDC_bestval-72ed969b2a395fffe11a0d5bf0a635e7260ba2588c28683630d97ff7153389fc.pth',
|
| 17 |
+
'EfficientNetAutoAttB4_FFPP':'https://f002.backblazeb2.com/file/icpr2020/EfficientNetAutoAttB4_FFPP_bestval-b0c9e9522a7143cf119843e910234be5e30f77dc527b1b427cdffa5ce3bdbc25.pth',
|
| 18 |
+
'EfficientNetB4ST_DFDC':'https://f002.backblazeb2.com/file/icpr2020/EfficientNetB4ST_DFDC_bestval-86f0a0701b18694dfb5e7837bd09fa8e48a5146c193227edccf59f1b038181c6.pth',
|
| 19 |
+
'EfficientNetB4ST_FFPP':'https://f002.backblazeb2.com/file/icpr2020/EfficientNetB4ST_FFPP_bestval-ccd016668071be5bf5fff68e446d055441739ec7113fb1a6eee998f08396ae92.pth',
|
| 20 |
+
'EfficientNetB4_DFDC':'https://f002.backblazeb2.com/file/icpr2020/EfficientNetB4_DFDC_bestval-c9f3663e2116d3356d056a0ce6453e0fc412a8df68ebd0902f07104d9129a09a.pth',
|
| 21 |
+
'EfficientNetB4_FFPP':'https://f002.backblazeb2.com/file/icpr2020/EfficientNetB4_FFPP_bestval-93aaad84946829e793d1a67ed7e0309b535e2f2395acb4f8d16b92c0616ba8d7.pth',
|
| 22 |
+
'Xception_DFDC':'https://f002.backblazeb2.com/file/icpr2020/Xception_DFDC_bestval-e826cdb64d73ef491e6b8ff8fce0e1e1b7fc1d8e2715bc51a56280fff17596f9.pth',
|
| 23 |
+
'Xception_FFPP':'https://f002.backblazeb2.com/file/icpr2020/Xception_FFPP_bestval-bb119e4913cb8f816cd28a03f81f4c603d6351bf8e3f8e3eb99eebc923aecd22.pth',
|
| 24 |
+
}
|
models/icpr2020dfdc/blazeface/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .blazeface import BlazeFace
|
| 2 |
+
from .face_extract import FaceExtractor
|
| 3 |
+
from .read_video import VideoReader
|
models/icpr2020dfdc/blazeface/anchors.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a10bb2fb93ab54ca426d6c750bfc3aad685028a16dcf231357d03694f261fd95
|
| 3 |
+
size 28800
|
models/icpr2020dfdc/blazeface/blazeface.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:54ecff653feaaaf1f7d44b6aff28fd2fc50e483a4e847563b6dd261369c43ba4
|
| 3 |
+
size 420224
|
models/icpr2020dfdc/blazeface/blazeface.py
ADDED
|
@@ -0,0 +1,417 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List
|
| 2 |
+
|
| 3 |
+
import numpy as np
|
| 4 |
+
import torch
|
| 5 |
+
import torch.nn as nn
|
| 6 |
+
import torch.nn.functional as F
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class BlazeBlock(nn.Module):
    """Residual block: grouped conv + 1x1 conv on one path, shortcut on the other.

    With ``stride == 2`` the shortcut is max-pooled and the conv input is
    zero-padded by hand; extra output channels on the shortcut are filled
    with zeros.
    """

    def __init__(self, in_channels, out_channels, kernel_size=3, stride=1):
        super(BlazeBlock, self).__init__()

        self.stride = stride
        self.channel_pad = out_channels - in_channels

        # TFLite uses slightly different padding than PyTorch on the
        # depthwise conv layer when the stride is 2, so in that case the
        # conv itself gets no padding and forward() pads explicitly.
        downsample = stride == 2
        if downsample:
            self.max_pool = nn.MaxPool2d(kernel_size=stride, stride=stride)
        padding = 0 if downsample else (kernel_size - 1) // 2

        depthwise = nn.Conv2d(in_channels=in_channels, out_channels=in_channels,
                              kernel_size=kernel_size, stride=stride, padding=padding,
                              groups=in_channels, bias=True)
        pointwise = nn.Conv2d(in_channels=in_channels, out_channels=out_channels,
                              kernel_size=1, stride=1, padding=0, bias=True)
        self.convs = nn.Sequential(depthwise, pointwise)

        self.act = nn.ReLU(inplace=True)

    def forward(self, x):
        """Return ``act(convs(padded input) + shortcut)``."""
        if self.stride == 2:
            # Pad two zeros on the right and bottom before the strided conv.
            conv_in = F.pad(x, (0, 2, 0, 2), "constant", 0)
            shortcut = self.max_pool(x)
        else:
            conv_in = x
            shortcut = x

        if self.channel_pad > 0:
            # Append zero channels so the shortcut matches the conv output.
            shortcut = F.pad(shortcut, (0, 0, 0, 0, 0, self.channel_pad), "constant", 0)

        return self.act(self.convs(conv_in) + shortcut)
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
class BlazeFace(nn.Module):
    """The BlazeFace face detection model from MediaPipe.

    The version from MediaPipe is simpler than the one in the paper;
    it does not use the "double" BlazeBlocks.

    Because we won't be training this model, it doesn't need to have
    batchnorm layers. These have already been "folded" into the conv
    weights by TFLite.

    The conversion to PyTorch is fairly straightforward, but there are
    some small differences between TFLite and PyTorch in how they handle
    padding on conv layers with stride 2.

    This version works on batches, while the MediaPipe version can only
    handle a single image at a time.

    Before calling any ``predict_*`` method, call ``load_weights`` and
    ``load_anchors``: prediction reads ``self.anchors``, which only
    ``load_anchors`` sets.

    Based on code from https://github.com/tkat0/PyTorch_BlazeFace/ and
    https://github.com/google/mediapipe/
    """
    # Spatial size the network input must have (checked in predict_on_batch).
    input_size = (128, 128)

    # Column names of the 17 values that make up one detection row.
    detection_keys = [
        'ymin', 'xmin', 'ymax', 'xmax',
        'kp1x', 'kp1y', 'kp2x', 'kp2y', 'kp3x', 'kp3y', 'kp4x', 'kp4y', 'kp5x', 'kp5y', 'kp6x', 'kp6y',
        'conf'
    ]

    def __init__(self):
        super(BlazeFace, self).__init__()

        # These are the settings from the MediaPipe example graph
        # mediapipe/graphs/face_detection/face_detection_mobile_gpu.pbtxt
        self.num_classes = 1
        self.num_anchors = 896
        self.num_coords = 16
        self.score_clipping_thresh = 100.0
        self.x_scale = 128.0
        self.y_scale = 128.0
        self.h_scale = 128.0
        self.w_scale = 128.0
        self.min_score_thresh = 0.75
        self.min_suppression_threshold = 0.3

        self._define_layers()

    def _define_layers(self):
        """Create the two backbone stages and the classifier/regressor heads."""
        self.backbone1 = nn.Sequential(
            nn.Conv2d(in_channels=3, out_channels=24, kernel_size=5, stride=2, padding=0, bias=True),
            nn.ReLU(inplace=True),

            BlazeBlock(24, 24),
            BlazeBlock(24, 28),
            BlazeBlock(28, 32, stride=2),
            BlazeBlock(32, 36),
            BlazeBlock(36, 42),
            BlazeBlock(42, 48, stride=2),
            BlazeBlock(48, 56),
            BlazeBlock(56, 64),
            BlazeBlock(64, 72),
            BlazeBlock(72, 80),
            BlazeBlock(80, 88),
        )

        self.backbone2 = nn.Sequential(
            BlazeBlock(88, 96, stride=2),
            BlazeBlock(96, 96),
            BlazeBlock(96, 96),
            BlazeBlock(96, 96),
            BlazeBlock(96, 96),
        )

        self.classifier_8 = nn.Conv2d(88, 2, 1, bias=True)
        self.classifier_16 = nn.Conv2d(96, 6, 1, bias=True)

        self.regressor_8 = nn.Conv2d(88, 32, 1, bias=True)
        self.regressor_16 = nn.Conv2d(96, 96, 1, bias=True)

    def forward(self, x):
        """Run the network and return ``[r, c]``: raw box regressions and raw scores."""
        # TFLite uses slightly different padding on the first conv layer
        # than PyTorch, so do it manually.
        x = F.pad(x, (1, 2, 1, 2), "constant", 0)

        b = x.shape[0]  # batch size, needed for reshaping later

        x = self.backbone1(x)  # (b, 88, 16, 16)
        h = self.backbone2(x)  # (b, 96, 8, 8)

        # Note: Because PyTorch is NCHW but TFLite is NHWC, we need to
        # permute the output from the conv layers before reshaping it.

        c1 = self.classifier_8(x)  # (b, 2, 16, 16)
        c1 = c1.permute(0, 2, 3, 1)  # (b, 16, 16, 2)
        c1 = c1.reshape(b, -1, 1)  # (b, 512, 1)

        c2 = self.classifier_16(h)  # (b, 6, 8, 8)
        c2 = c2.permute(0, 2, 3, 1)  # (b, 8, 8, 6)
        c2 = c2.reshape(b, -1, 1)  # (b, 384, 1)

        c = torch.cat((c1, c2), dim=1)  # (b, 896, 1)

        r1 = self.regressor_8(x)  # (b, 32, 16, 16)
        r1 = r1.permute(0, 2, 3, 1)  # (b, 16, 16, 32)
        r1 = r1.reshape(b, -1, 16)  # (b, 512, 16)

        r2 = self.regressor_16(h)  # (b, 96, 8, 8)
        r2 = r2.permute(0, 2, 3, 1)  # (b, 8, 8, 96)
        r2 = r2.reshape(b, -1, 16)  # (b, 384, 16)

        r = torch.cat((r1, r2), dim=1)  # (b, 896, 16)
        return [r, c]

    def _device(self):
        """Which device (CPU or GPU) is being used by this model?"""
        return self.classifier_8.weight.device

    def load_weights(self, path):
        """Load a state dict from ``path`` and switch the model to eval mode."""
        # Deserialize onto the CPU so a checkpoint saved from a GPU also loads
        # on a CPU-only host; load_state_dict then copies the values into the
        # existing parameters on whatever device they live on.
        self.load_state_dict(torch.load(path, map_location="cpu"))
        self.eval()

    def load_anchors(self, path):
        """Load the (num_anchors, 4) anchor array from a ``.npy`` file."""
        self.anchors = torch.tensor(np.load(path), dtype=torch.float32, device=self._device())
        assert (self.anchors.ndimension() == 2)
        assert (self.anchors.shape[0] == self.num_anchors)
        assert (self.anchors.shape[1] == 4)

    def _preprocess(self, x):
        """Converts the image pixels to the range [-1, 1]."""
        return x.float() / 127.5 - 1.0

    def predict_on_image(self, img):
        """Makes a prediction on a single image.

        Arguments:
            img: a NumPy array of shape (H, W, 3) or a PyTorch tensor of
                 shape (3, H, W). The image's height and width should be
                 128 pixels.

        Returns:
            A tensor with face detections.
        """
        if isinstance(img, np.ndarray):
            img = torch.from_numpy(img).permute((2, 0, 1))

        return self.predict_on_batch(img.unsqueeze(0))[0]

    def predict_on_batch(self, x: "np.ndarray | torch.Tensor", apply_nms: bool = True) -> List[torch.Tensor]:
        """Makes a prediction on a batch of images.

        Arguments:
            x: a NumPy array of shape (b, H, W, 3) or a PyTorch tensor of
               shape (b, 3, H, W). The height and width should be 128 pixels.
            apply_nms: pass False to not apply non-max suppression

        Returns:
            A list containing a tensor of face detections for each image in
            the batch. If no faces are found for an image, returns a tensor
            of shape (0, 17).

        Each face detection is a PyTorch tensor consisting of 17 numbers:
            - ymin, xmin, ymax, xmax
            - x,y-coordinates for the 6 keypoints
            - confidence score
        """
        if isinstance(x, np.ndarray):
            x = torch.from_numpy(x).permute((0, 3, 1, 2))

        assert x.shape[1] == 3
        assert x.shape[2] == 128
        assert x.shape[3] == 128

        # 1. Preprocess the images into tensors:
        x = x.to(self._device())
        x = self._preprocess(x)

        # 2. Run the neural network:
        with torch.no_grad():
            out = self(x)

        # 3. Postprocess the raw predictions:
        detections = self._tensors_to_detections(out[0], out[1], self.anchors)

        # 4. Non-maximum suppression to remove overlapping detections:
        return self.nms(detections) if apply_nms else detections

    def nms(self, detections: List[torch.Tensor]) -> List[torch.Tensor]:
        """Filters out overlapping detections, one image at a time."""
        filtered_detections = []
        for image_detections in detections:
            faces = self._weighted_non_max_suppression(image_detections)
            # Keep the documented (0, 17) shape when nothing survives.
            faces = torch.stack(faces) if len(faces) > 0 else torch.zeros((0, 17), device=self._device())
            filtered_detections.append(faces)

        return filtered_detections

    def _tensors_to_detections(self, raw_box_tensor: torch.Tensor, raw_score_tensor: torch.Tensor, anchors) -> List[
            torch.Tensor]:
        """The output of the neural network is a tensor of shape (b, 896, 16)
        containing the bounding box regressor predictions, as well as a tensor
        of shape (b, 896, 1) with the classification confidences.

        This function converts these two "raw" tensors into proper detections.
        Returns a list of (num_detections, 17) tensors, one for each image in
        the batch.

        This is based on the source code from:
        mediapipe/calculators/tflite/tflite_tensors_to_detections_calculator.cc
        mediapipe/calculators/tflite/tflite_tensors_to_detections_calculator.proto
        """
        assert raw_box_tensor.ndimension() == 3
        assert raw_box_tensor.shape[1] == self.num_anchors
        assert raw_box_tensor.shape[2] == self.num_coords

        assert raw_score_tensor.ndimension() == 3
        assert raw_score_tensor.shape[1] == self.num_anchors
        assert raw_score_tensor.shape[2] == self.num_classes

        assert raw_box_tensor.shape[0] == raw_score_tensor.shape[0]

        detection_boxes = self._decode_boxes(raw_box_tensor, anchors)

        thresh = self.score_clipping_thresh
        raw_score_tensor = raw_score_tensor.clamp(-thresh, thresh)
        detection_scores = raw_score_tensor.sigmoid().squeeze(dim=-1)

        # Note: we stripped off the last dimension from the scores tensor
        # because there is only one class. Now we can simply use a mask
        # to filter out the boxes with too low confidence.
        mask = detection_scores >= self.min_score_thresh

        # Because each image from the batch can have a different number of
        # detections, process them one at a time using a loop.
        output_detections = []
        for i in range(raw_box_tensor.shape[0]):
            boxes = detection_boxes[i, mask[i]]
            scores = detection_scores[i, mask[i]].unsqueeze(dim=-1)
            output_detections.append(torch.cat((boxes, scores), dim=-1))

        return output_detections

    def _decode_boxes(self, raw_boxes, anchors):
        """Converts the predictions into actual coordinates using
        the anchor boxes. Processes the entire batch at once.
        """
        boxes = torch.zeros_like(raw_boxes)

        x_center = raw_boxes[..., 0] / self.x_scale * anchors[:, 2] + anchors[:, 0]
        y_center = raw_boxes[..., 1] / self.y_scale * anchors[:, 3] + anchors[:, 1]

        w = raw_boxes[..., 2] / self.w_scale * anchors[:, 2]
        h = raw_boxes[..., 3] / self.h_scale * anchors[:, 3]

        boxes[..., 0] = y_center - h / 2.  # ymin
        boxes[..., 1] = x_center - w / 2.  # xmin
        boxes[..., 2] = y_center + h / 2.  # ymax
        boxes[..., 3] = x_center + w / 2.  # xmax

        for k in range(6):
            offset = 4 + k * 2
            keypoint_x = raw_boxes[..., offset] / self.x_scale * anchors[:, 2] + anchors[:, 0]
            keypoint_y = raw_boxes[..., offset + 1] / self.y_scale * anchors[:, 3] + anchors[:, 1]
            boxes[..., offset] = keypoint_x
            boxes[..., offset + 1] = keypoint_y

        return boxes

    def _weighted_non_max_suppression(self, detections):
        """The alternative NMS method as mentioned in the BlazeFace paper:

        "We replace the suppression algorithm with a blending strategy that
        estimates the regression parameters of a bounding box as a weighted
        mean between the overlapping predictions."

        The original MediaPipe code assigns the score of the most confident
        detection to the weighted detection, but we take the average score
        of the overlapping detections.

        The input detections should be a Tensor of shape (count, 17).

        Returns a list of PyTorch tensors, one for each detected face.

        This is based on the source code from:
        mediapipe/calculators/util/non_max_suppression_calculator.cc
        mediapipe/calculators/util/non_max_suppression_calculator.proto
        """
        if len(detections) == 0:
            return []

        output_detections = []

        # Sort the detections from highest to lowest score.
        remaining = torch.argsort(detections[:, 16], descending=True)

        while len(remaining) > 0:
            detection = detections[remaining[0]]

            # Compute the overlap between the first box and the other
            # remaining boxes. (Note that the other_boxes also include
            # the first_box.)
            first_box = detection[:4]
            other_boxes = detections[remaining, :4]
            ious = overlap_similarity(first_box, other_boxes)

            # If two detections don't overlap enough, they are considered
            # to be from different faces.
            mask = ious > self.min_suppression_threshold
            # The current detection is always remaining[0]. A zero-area or
            # inverted box has an IoU with itself that is NaN or <= 0, which
            # would leave `remaining` unchanged and loop forever; consume it
            # unconditionally so every iteration makes progress.
            mask[0] = True
            overlapping = remaining[mask]
            remaining = remaining[~mask]

            # Take an average of the coordinates from the overlapping
            # detections, weighted by their confidence scores.
            weighted_detection = detection.clone()
            if len(overlapping) > 1:
                coordinates = detections[overlapping, :16]
                scores = detections[overlapping, 16:17]
                total_score = scores.sum()
                weighted = (coordinates * scores).sum(dim=0) / total_score
                weighted_detection[:16] = weighted
                weighted_detection[16] = total_score / len(overlapping)

            output_detections.append(weighted_detection)

        return output_detections
|
| 369 |
+
|
| 370 |
+
# IOU code from https://github.com/amdegroot/ssd.pytorch/blob/master/layers/box_utils.py
|
| 371 |
+
|
| 372 |
+
|
| 373 |
+
def intersect(box_a, box_b):
    """Pairwise intersection area between two sets of boxes.

    Both inputs are broadcast to a common [A, B, 2] view (no new allocation):
      [A,2] -> [A,1,2] -> [A,B,2]
      [B,2] -> [1,B,2] -> [A,B,2]
    The first two columns of each box are treated as the minimum corner and
    the last two as the maximum corner.

    Args:
      box_a: (tensor) bounding boxes, Shape: [A,4].
      box_b: (tensor) bounding boxes, Shape: [B,4].
    Return:
      (tensor) intersection area, Shape: [A,B].
    """
    pair_shape = (box_a.size(0), box_b.size(0), 2)

    # Smallest of the two maximum corners and largest of the two minimum corners.
    upper = torch.min(box_a[:, 2:].unsqueeze(1).expand(*pair_shape),
                      box_b[:, 2:].unsqueeze(0).expand(*pair_shape))
    lower = torch.max(box_a[:, :2].unsqueeze(1).expand(*pair_shape),
                      box_b[:, :2].unsqueeze(0).expand(*pair_shape))

    # A negative extent means the boxes do not overlap along that axis.
    extent = torch.clamp(upper - lower, min=0)
    return extent[:, :, 0] * extent[:, :, 1]
|
| 392 |
+
|
| 393 |
+
|
| 394 |
+
def jaccard(box_a, box_b):
    """Compute the jaccard overlap of two sets of boxes. The jaccard overlap
    is simply the intersection over union of two boxes. Here we operate on
    ground truth boxes and default boxes.
    E.g.:
        A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B)

    A pair whose intersection and union are both zero (e.g. two zero-area
    boxes) is reported as 0 overlap instead of NaN.

    Args:
        box_a: (tensor) Ground truth bounding boxes, Shape: [num_objects,4]
        box_b: (tensor) Prior boxes from priorbox layers, Shape: [num_priors,4]
    Return:
        jaccard overlap: (tensor) Shape: [box_a.size(0), box_b.size(0)]
    """
    inter = intersect(box_a, box_b)
    area_a = ((box_a[:, 2] - box_a[:, 0]) *
              (box_a[:, 3] - box_a[:, 1])).unsqueeze(1).expand_as(inter)  # [A,B]
    area_b = ((box_b[:, 2] - box_b[:, 0]) *
              (box_b[:, 3] - box_b[:, 1])).unsqueeze(0).expand_as(inter)  # [A,B]
    union = area_a + area_b - inter
    iou = inter / union  # [A,B]
    # 0 / 0 yields NaN, and NaN compares False against any threshold, which
    # silently hides the pair from callers; map exactly those entries to 0.
    return torch.where(torch.isnan(iou), torch.zeros_like(iou), iou)
|
| 413 |
+
|
| 414 |
+
|
| 415 |
+
def overlap_similarity(box, other_boxes):
    """Return the IOU of one bounding box against each box in ``other_boxes``."""
    # jaccard works on sets of boxes, so wrap the single box as a 1-row set
    # and drop that leading row again from the [1, B] result.
    single_row = box.unsqueeze(0)
    pairwise = jaccard(single_row, other_boxes)
    return pairwise.squeeze(0)
|
models/icpr2020dfdc/blazeface/face_extract.py
ADDED
|
@@ -0,0 +1,470 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from typing import Tuple, List
|
| 3 |
+
|
| 4 |
+
import cv2
|
| 5 |
+
import numpy as np
|
| 6 |
+
import torch
|
| 7 |
+
from PIL import Image
|
| 8 |
+
|
| 9 |
+
from blazeface import BlazeFace
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class FaceExtractor:
|
| 13 |
+
"""Wrapper for face extraction workflow."""
|
| 14 |
+
|
| 15 |
+
def __init__(self, video_read_fn = None, facedet: BlazeFace = None):
    """Store the collaborators used by the extraction workflow.

    Arguments:
        video_read_fn: callable taking a video file path and returning
            either a tuple of (NumPy array of shape
            (num_frames, H, W, 3), list of frame indices) or None when
            the video could not be read
        facedet: the face detector object
    """
    self.video_read_fn, self.facedet = video_read_fn, facedet
|
| 27 |
+
|
| 28 |
+
def process_image(self, path: str = None, img: "Image.Image | np.ndarray" = None) -> dict:
    """Detect and crop the faces in a single image.

    Exactly one of ``path`` and ``img`` must be given.

    :param path: Path to the image. The file is decoded as RGB and closed
        before detection starts.
    :param img: image, as a PIL image (converted to RGB) or an array that
        is passed through ``np.asarray`` unchanged
    :return: dict with keys ``frame_w``, ``frame_h``, ``faces``, ``kpts``,
        ``detections`` and ``scores``, with the per-face entries ordered by
        descending confidence
    :raises ValueError: if both or neither of ``path`` and ``img`` are given
    """

    if img is not None and path is not None:
        raise ValueError('Only one argument between path and img can be specified')
    if img is None and path is None:
        raise ValueError('At least one argument between path and img must be specified')

    target_size = self.facedet.input_size

    if img is None:
        # The context manager releases the file handle; convert('RGB') makes
        # grayscale / palette / RGBA files usable, because the tiling step
        # allocates exactly three channels.
        with Image.open(str(path)) as opened:
            img = np.asarray(opened.convert('RGB'))
    elif isinstance(img, Image.Image):
        img = np.asarray(img.convert('RGB'))
    else:
        img = np.asarray(img)

    # Split the frames into several tiles. Resize the tiles to 128x128.
    tiles, resize_info = self._tile_frames(np.expand_dims(img, 0), target_size)
    # tiles has shape (num_tiles, target_size, target_size, 3)
    # resize_info is a list of four elements [scale_w, scale_h, offset_x, offset_y]

    # Run the face detector. The result is a list of PyTorch tensors,
    # one for each tile in the batch.
    detections = self.facedet.predict_on_batch(tiles, apply_nms=False)

    # Convert the detections from 128x128 back to the original frame size.
    detections = self._resize_detections(detections, target_size, resize_info)

    # Because we have several tiles for each frame, combine the predictions
    # from these tiles. The result is a list of PyTorch tensors, but now one
    # for each frame (rather than each tile).
    num_frames = 1
    frame_size = (img.shape[1], img.shape[0])
    detections = self._untile_detections(num_frames, frame_size, detections)

    # The same face may have been detected in multiple tiles, so filter out
    # overlapping detections. This is done separately for each frame.
    detections = self.facedet.nms(detections)

    # Crop the faces out of the original frame.
    frameref_detections = self._add_margin_to_detections(detections[0], frame_size, 0.2)
    faces = self._crop_faces(img, frameref_detections)
    kpts = self._crop_kpts(img, detections[0], 0.3)

    # Add additional information about the frame and detections.
    scores = list(detections[0][:, 16].cpu().numpy())
    frame_dict = {"frame_w": frame_size[0],
                  "frame_h": frame_size[1],
                  "faces": faces,
                  "kpts": kpts,
                  "detections": frameref_detections.cpu().numpy(),
                  "scores": scores,
                  }

    # Sort faces by descending confidence
    frame_dict = self._soft_faces_by_descending_score(frame_dict)

    return frame_dict
|
| 90 |
+
|
| 91 |
+
def _soft_faces_by_descending_score(self, frame_dict: dict) -> dict:
|
| 92 |
+
if len(frame_dict['scores']) > 1:
|
| 93 |
+
sort_idxs = np.argsort(frame_dict['scores'])[::-1]
|
| 94 |
+
new_faces = [frame_dict['faces'][i] for i in sort_idxs]
|
| 95 |
+
new_kpts = [frame_dict['kpts'][i] for i in sort_idxs]
|
| 96 |
+
new_detections = frame_dict['detections'][sort_idxs]
|
| 97 |
+
new_scores = [frame_dict['scores'][i] for i in sort_idxs]
|
| 98 |
+
frame_dict['faces'] = new_faces
|
| 99 |
+
frame_dict['kpts'] = new_kpts
|
| 100 |
+
frame_dict['detections'] = new_detections
|
| 101 |
+
frame_dict['scores'] = new_scores
|
| 102 |
+
return frame_dict
|
| 103 |
+
|
| 104 |
+
def process_videos(self, input_dir, filenames, video_idxs) -> List[dict]:
    """Run face extraction over a selection of videos in one detector batch.

    Each selected video is read with ``self.video_read_fn``, its frames are
    split into tiles, and the tiles of all videos are concatenated into a
    single batch for the face detector.

    Arguments:
        input_dir: base folder where the video files are stored
        filenames: list of all video files in the input_dir
        video_idxs: one or more indices from the filenames list; these
            are the videos we'll actually process

    Returns a list of dictionaries, one for each frame read from each video.

    This dictionary contains:
        - video_idx: the video this frame was taken from
        - frame_idx: the index of the frame in the video
        - frame_w, frame_h: original dimensions of the frame
        - frame: the original frame
        - faces: a list containing zero or more NumPy arrays with a face crop
        - kpts: the keypoint crops for the frame
        - detections: the margin-expanded detections as a NumPy array
        - scores: a list with the confidence score for each face crop

    Videos for which ``video_read_fn`` returns None are skipped and do not
    appear in the output. An empty list is returned when no video could be
    read.
    """
    target_size = self.facedet.input_size

    # One record per successfully read video:
    # (video_idx, frames, frame_idxs, tiles, resize_info)
    loaded = []
    for video_idx in video_idxs:
        video_path = os.path.join(input_dir, filenames[video_idx])
        read_result = self.video_read_fn(video_path)

        # Error? Then skip this video.
        if read_result is None:
            continue

        video_frames, video_frame_idxs = read_result

        # Split the frames into several tiles. Resize the tiles to 128x128.
        video_tiles, video_resize_info = self._tile_frames(video_frames, target_size)
        loaded.append((video_idx, video_frames, video_frame_idxs, video_tiles, video_resize_info))

    if not loaded:
        return []

    # Put all the tiles for all the frames from all the videos into
    # a single batch, and run the detector once. The result is a list of
    # PyTorch tensors, one for each image in the batch.
    batch = np.concatenate([record[3] for record in loaded])
    all_detections = self.facedet.predict_on_batch(batch, apply_nms=False)

    output = []
    cursor = 0
    for video_idx, video_frames, video_frame_idxs, video_tiles, video_resize_info in loaded:
        # Not all videos may have the same number of tiles, so slice out
        # the detections that belong to this video.
        tile_count = video_tiles.shape[0]
        detections = all_detections[cursor:cursor + tile_count]
        cursor += tile_count

        # Convert the detections from 128x128 back to the original frame size.
        detections = self._resize_detections(detections, target_size, video_resize_info)

        # Regroup the per-tile predictions into one tensor per frame.
        frame_count = video_frames.shape[0]
        frame_size = (video_frames.shape[2], video_frames.shape[1])
        detections = self._untile_detections(frame_count, frame_size, detections)

        # The same face may have been detected in multiple tiles, so filter out
        # overlapping detections. This is done separately for each frame.
        detections = self.facedet.nms(detections)

        for i, frame_detections in enumerate(detections):
            frame = video_frames[i]

            # Crop the faces out of the original frame.
            frameref_detections = self._add_margin_to_detections(frame_detections, frame_size, 0.2)
            faces = self._crop_faces(frame, frameref_detections)
            kpts = self._crop_kpts(frame, frame_detections, 0.3)

            # Add additional information about the frame and detections.
            scores = list(frame_detections[:, 16].cpu().numpy())
            frame_dict = {"video_idx": video_idx,
                          "frame_idx": video_frame_idxs[i],
                          "frame_w": frame_size[0],
                          "frame_h": frame_size[1],
                          "frame": frame,
                          "faces": faces,
                          "kpts": kpts,
                          "detections": frameref_detections.cpu().numpy(),
                          "scores": scores,
                          }

            # Sort faces by descending confidence
            output.append(self._soft_faces_by_descending_score(frame_dict))

    return output
|
| 219 |
+
|
| 220 |
+
def process_video(self, video_path):
    """Run face extraction on one video file.

    Thin wrapper around ``process_videos``: the path is split into its
    folder and file name, and that single file is selected by index 0.
    """
    folder, name = os.path.split(video_path)
    return self.process_videos(folder, [name], [0])
|
| 225 |
+
|
| 226 |
+
def _tile_frames(self, frames: np.ndarray, target_size: Tuple[int, int]) -> Tuple[np.ndarray, List[float]]:
    """Splits each frame into several smaller, partially overlapping tiles
    and resizes each tile to target_size.

    The tile geometry (tile count per axis, square tile side and the step
    between tiles) comes from ``get_tiles_params``. Tiles are emitted frame
    by frame, row by row, left to right.

    NOTE(review): earlier notes on this method described three 1080x1080
    windows for a 1920x1080 video and reported that 720x720 windows gave
    many false positives. ``get_tiles_params`` caps the tile side at 720,
    so those notes do not describe the current behaviour; confirm which
    tiling is intended.

    Arguments:
        frames: NumPy array of shape (num_frames, height, width, 3)
        target_size: (width, height)

    Returns:
        - a new (num_frames * N, target_size[1], target_size[0], 3) uint8
          array where N is the number of tiles used per frame.
        - a list [scale_w, scale_h, offset_x, offset_y] that describes how
          to map the resized and cropped tiles back to the original image
          coordinates. This is needed for scaling up the face detections
          from the smaller image to the original image, so we can take the
          face crops in the original coordinate space. The two offsets are
          always 0 here.
    """
    num_frames, H, W, _ = frames.shape

    num_h, num_v, split_size, x_step, y_step = self.get_tiles_params(H, W)

    splits = np.zeros((num_frames * num_v * num_h, target_size[1], target_size[0], 3), dtype=np.uint8)

    i = 0
    for f in range(num_frames):
        y = 0
        for v in range(num_v):
            x = 0
            for h in range(num_h):
                crop = frames[f, y:y + split_size, x:x + split_size, :]
                splits[i] = cv2.resize(crop, target_size, interpolation=cv2.INTER_AREA)
                x += x_step
                i += 1
            y += y_step

    resize_info = [split_size / target_size[0], split_size / target_size[1], 0, 0]
    return splits, resize_info
|
| 278 |
+
|
| 279 |
+
def get_tiles_params(self, H, W):
    """Compute the tiling layout for a frame of height ``H`` and width ``W``.

    Tiles are squares whose side is the smaller frame dimension, capped at
    720. Along each axis the step is half of the space left over after one
    tile; when that step is 0 a single tile is used on that axis.

    Returns the tuple (num_h, num_v, split_size, x_step, y_step).
    """
    split_size = min(H, W, 720)

    def _axis_layout(extent):
        # Returns (tile count, step) along one axis of the given extent.
        spare = extent - split_size
        step = spare // 2
        count = spare // step + 1 if step > 0 else 1
        return count, step

    num_h, x_step = _axis_layout(W)
    num_v, y_step = _axis_layout(H)
    return num_h, num_v, split_size, x_step, y_step
|
| 286 |
+
|
| 287 |
+
def _resize_detections(self, detections, target_size, resize_info):
|
| 288 |
+
"""Converts a list of face detections back to the original
|
| 289 |
+
coordinate system.
|
| 290 |
+
|
| 291 |
+
Arguments:
|
| 292 |
+
detections: a list containing PyTorch tensors of shape (num_faces, 17)
|
| 293 |
+
target_size: (width, height)
|
| 294 |
+
resize_info: [scale_w, scale_h, offset_x, offset_y]
|
| 295 |
+
"""
|
| 296 |
+
projected = []
|
| 297 |
+
target_w, target_h = target_size
|
| 298 |
+
scale_w, scale_h, offset_x, offset_y = resize_info
|
| 299 |
+
|
| 300 |
+
for i in range(len(detections)):
|
| 301 |
+
detection = detections[i].clone()
|
| 302 |
+
|
| 303 |
+
# ymin, xmin, ymax, xmax
|
| 304 |
+
for k in range(2):
|
| 305 |
+
detection[:, k * 2] = (detection[:, k * 2] * target_h - offset_y) * scale_h
|
| 306 |
+
detection[:, k * 2 + 1] = (detection[:, k * 2 + 1] * target_w - offset_x) * scale_w
|
| 307 |
+
|
| 308 |
+
# keypoints are x,y
|
| 309 |
+
for k in range(2, 8):
|
| 310 |
+
detection[:, k * 2] = (detection[:, k * 2] * target_w - offset_x) * scale_w
|
| 311 |
+
detection[:, k * 2 + 1] = (detection[:, k * 2 + 1] * target_h - offset_y) * scale_h
|
| 312 |
+
|
| 313 |
+
projected.append(detection)
|
| 314 |
+
|
| 315 |
+
return projected
|
| 316 |
+
|
| 317 |
+
def _untile_detections(self, num_frames: int, frame_size: Tuple[int, int], detections: List[torch.Tensor]) -> List[
|
| 318 |
+
torch.Tensor]:
|
| 319 |
+
"""With N tiles per frame, there also are N times as many detections.
|
| 320 |
+
This function groups together the detections for a given frame; it is
|
| 321 |
+
the complement to tile_frames().
|
| 322 |
+
"""
|
| 323 |
+
combined_detections = []
|
| 324 |
+
|
| 325 |
+
W, H = frame_size
|
| 326 |
+
|
| 327 |
+
num_h, num_v, split_size, x_step, y_step = self.get_tiles_params(H, W)
|
| 328 |
+
|
| 329 |
+
i = 0
|
| 330 |
+
for f in range(num_frames):
|
| 331 |
+
detections_for_frame = []
|
| 332 |
+
y = 0
|
| 333 |
+
for v in range(num_v):
|
| 334 |
+
x = 0
|
| 335 |
+
for h in range(num_h):
|
| 336 |
+
# Adjust the coordinates based on the split positions.
|
| 337 |
+
detection = detections[i].clone()
|
| 338 |
+
if detection.shape[0] > 0:
|
| 339 |
+
for k in range(2):
|
| 340 |
+
detection[:, k * 2] += y
|
| 341 |
+
detection[:, k * 2 + 1] += x
|
| 342 |
+
for k in range(2, 8):
|
| 343 |
+
detection[:, k * 2] += x
|
| 344 |
+
detection[:, k * 2 + 1] += y
|
| 345 |
+
|
| 346 |
+
detections_for_frame.append(detection)
|
| 347 |
+
x += x_step
|
| 348 |
+
i += 1
|
| 349 |
+
y += y_step
|
| 350 |
+
|
| 351 |
+
combined_detections.append(torch.cat(detections_for_frame))
|
| 352 |
+
|
| 353 |
+
return combined_detections
|
| 354 |
+
|
| 355 |
+
def _add_margin_to_detections(self, detections: torch.Tensor, frame_size: Tuple[int, int],
|
| 356 |
+
margin: float = 0.2) -> torch.Tensor:
|
| 357 |
+
"""Expands the face bounding box.
|
| 358 |
+
|
| 359 |
+
NOTE: The face detections often do not include the forehead, which
|
| 360 |
+
is why we use twice the margin for ymin.
|
| 361 |
+
|
| 362 |
+
Arguments:
|
| 363 |
+
detections: a PyTorch tensor of shape (num_detections, 17)
|
| 364 |
+
frame_size: maximum (width, height)
|
| 365 |
+
margin: a percentage of the bounding box's height
|
| 366 |
+
|
| 367 |
+
Returns a PyTorch tensor of shape (num_detections, 17).
|
| 368 |
+
"""
|
| 369 |
+
offset = torch.round(margin * (detections[:, 2] - detections[:, 0]))
|
| 370 |
+
detections = detections.clone()
|
| 371 |
+
detections[:, 0] = torch.clamp(detections[:, 0] - offset * 2, min=0) # ymin
|
| 372 |
+
detections[:, 1] = torch.clamp(detections[:, 1] - offset, min=0) # xmin
|
| 373 |
+
detections[:, 2] = torch.clamp(detections[:, 2] + offset, max=frame_size[1]) # ymax
|
| 374 |
+
detections[:, 3] = torch.clamp(detections[:, 3] + offset, max=frame_size[0]) # xmax
|
| 375 |
+
return detections
|
| 376 |
+
|
| 377 |
+
def _crop_faces(self, frame: np.ndarray, detections: torch.Tensor) -> List[np.ndarray]:
|
| 378 |
+
"""Copies the face region(s) from the given frame into a set
|
| 379 |
+
of new NumPy arrays.
|
| 380 |
+
|
| 381 |
+
Arguments:
|
| 382 |
+
frame: a NumPy array of shape (H, W, 3)
|
| 383 |
+
detections: a PyTorch tensor of shape (num_detections, 17)
|
| 384 |
+
|
| 385 |
+
Returns a list of NumPy arrays, one for each face crop. If there
|
| 386 |
+
are no faces detected for this frame, returns an empty list.
|
| 387 |
+
"""
|
| 388 |
+
faces = []
|
| 389 |
+
for i in range(len(detections)):
|
| 390 |
+
ymin, xmin, ymax, xmax = detections[i, :4].cpu().numpy().astype(int)
|
| 391 |
+
face = frame[ymin:ymax, xmin:xmax, :]
|
| 392 |
+
faces.append(face)
|
| 393 |
+
return faces
|
| 394 |
+
|
| 395 |
+
def _crop_kpts(self, frame: np.ndarray, detections: torch.Tensor, face_fraction: float):
|
| 396 |
+
"""Copies the parts region(s) from the given frame into a set
|
| 397 |
+
of new NumPy arrays.
|
| 398 |
+
|
| 399 |
+
Arguments:
|
| 400 |
+
frame: a NumPy array of shape (H, W, 3)
|
| 401 |
+
detections: a PyTorch tensor of shape (num_detections, 17)
|
| 402 |
+
face_fraction: float between 0 and 1 indicating how big are the parts to be extracted w.r.t the whole face
|
| 403 |
+
|
| 404 |
+
Returns a list of NumPy arrays, one for each face crop. If there
|
| 405 |
+
are no faces detected for this frame, returns an empty list.
|
| 406 |
+
"""
|
| 407 |
+
faces = []
|
| 408 |
+
for i in range(len(detections)):
|
| 409 |
+
kpts = []
|
| 410 |
+
size = int(face_fraction * min(detections[i, 2] - detections[i, 0], detections[i, 3] - detections[i, 1]))
|
| 411 |
+
kpts_coords = detections[i, 4:16].cpu().numpy().astype(int)
|
| 412 |
+
for kpidx in range(6):
|
| 413 |
+
kpx, kpy = kpts_coords[kpidx * 2:kpidx * 2 + 2]
|
| 414 |
+
kpt = frame[kpy - size // 2:kpy - size // 2 + size, kpx - size // 2:kpx - size // 2 + size, ]
|
| 415 |
+
kpts.append(kpt)
|
| 416 |
+
faces.append(kpts)
|
| 417 |
+
return faces
|
| 418 |
+
|
| 419 |
+
def remove_large_crops(self, crops, pct=0.1):
    """Removes faces from the results if they take up more than X%
    of the video. Such a face is likely a false positive.

    This is an optional postprocessing step. Modifies the original
    data structure: the "faces" and "scores" lists of every entry are
    replaced by filtered copies.

    Arguments:
        crops: a list of dictionaries with face crop data; each one needs
            the keys "frame_w", "frame_h", "faces" and "scores"
        pct: maximum portion of the frame a crop may take up; a face is
            kept only when its area is strictly below this fraction of the
            frame area
    """
    for frame_data in crops:
        video_area = frame_data["frame_w"] * frame_data["frame_h"]
        faces = frame_data["faces"]
        scores = frame_data["scores"]

        new_faces = []
        new_scores = []
        for j, face in enumerate(faces):
            face_H, face_W, _ = face.shape
            face_area = face_H * face_W
            # Honour the caller's threshold; the previous code compared
            # against a hard-coded 0.1 and silently ignored `pct`.
            if face_area / video_area < pct:
                new_faces.append(face)
                new_scores.append(scores[j])

        frame_data["faces"] = new_faces
        frame_data["scores"] = new_scores
|
| 446 |
+
|
| 447 |
+
def keep_only_best_face(self, crops):
    """Trim every frame's results down to its first face and score.

    The first entry of each list is taken to be the most confident
    detection. This gets rid of false positives, but obviously is
    problematic for videos with two people!

    This is an optional postprocessing step. Modifies the original
    data structure; frames without faces are left untouched.
    """
    for frame_data in crops:
        if len(frame_data["faces"]) == 0:
            continue
        for key in ("faces", "scores"):
            frame_data[key] = frame_data[key][:1]
|
| 461 |
+
|
| 462 |
+
# TODO: def filter_likely_false_positives(self, crops):
|
| 463 |
+
# if only some frames have more than 1 face, it's likely a false positive
|
| 464 |
+
# if most frames have more than 1 face, it's probably two people
|
| 465 |
+
# so find the % of frames with > 1 face; if > 0.X, keep the two best faces
|
| 466 |
+
|
| 467 |
+
# TODO: def filter_by_score(self, crops, min_score) to remove any
|
| 468 |
+
# crops with a confidence score lower than min_score
|
| 469 |
+
|
| 470 |
+
# TODO: def sort_by_histogram(self, crops) for videos with 2 people.
|
models/icpr2020dfdc/blazeface/read_video.py
ADDED
|
@@ -0,0 +1,213 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import cv2
|
| 2 |
+
import numpy as np
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
class VideoReader:
    """Helper class for reading one or more frames from a video file."""

    def __init__(self, verbose=True, insets=(0, 0)):
        """Creates a new VideoReader.

        Arguments:
            verbose: whether to print warnings and error messages
            insets: amount to inset the image by, as a percentage of
                (width, height). This lets you "zoom in" to an image
                to remove unimportant content around the borders.
                Useful for face detection, which may not work if the
                faces are too small.
        """
        self.verbose = verbose
        self.insets = insets

    def read_frames(self, path, num_frames, jitter=0, seed=None):
        """Reads frames that are always evenly spaced throughout the video.

        Arguments:
            path: the video file
            num_frames: how many frames to read; must be positive
            jitter: if not 0, adds small random offsets to the frame indices;
                this is useful so we don't always land on even or odd frames
            seed: random seed for jittering; if you set this to a fixed value,
                you probably want to set it only on the first video

        Returns the (frames, indices) pair produced by
        _read_frames_at_indices(), or None if the video reports no frames
        or reading failed.
        """
        assert num_frames > 0

        capture = cv2.VideoCapture(path)
        try:
            frame_count = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))
            if frame_count <= 0:
                return None

            frame_idxs = np.linspace(0, frame_count - 1, num_frames, endpoint=True, dtype=int)
            frame_idxs = np.unique(frame_idxs)  # Avoid repeating frame idxs otherwise it breaks reading
            if jitter > 0:
                np.random.seed(seed)
                jitter_offsets = np.random.randint(-jitter, jitter, len(frame_idxs))
                # Jitter may reorder or duplicate indices; the reader
                # below sorts and de-duplicates them again.
                frame_idxs = np.clip(frame_idxs + jitter_offsets, 0, frame_count - 1)

            return self._read_frames_at_indices(path, capture, frame_idxs)
        finally:
            # Release the capture on every path, including early returns.
            capture.release()

    def read_random_frames(self, path, num_frames, seed=None):
        """Picks the frame indices at random.

        Indices are drawn with replacement, so fewer than num_frames
        distinct frames may be returned.

        Arguments:
            path: the video file
            num_frames: how many frame indices to draw; must be positive
            seed: random seed passed to np.random.seed
        """
        assert num_frames > 0
        np.random.seed(seed)

        capture = cv2.VideoCapture(path)
        try:
            frame_count = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))
            if frame_count <= 0:
                return None

            frame_idxs = sorted(np.random.choice(np.arange(0, frame_count), num_frames))
            return self._read_frames_at_indices(path, capture, frame_idxs)
        finally:
            capture.release()

    def read_frames_at_indices(self, path, frame_idxs):
        """Reads frames from a video and puts them into a NumPy array.

        Arguments:
            path: the video file
            frame_idxs: a list of frame indices. If an index appears
                multiple times, the frame is still read only once.

        Returns:
            - a NumPy array of shape (num_frames, height, width, 3)
            - a list of the frame indices that were read

        Reading stops if loading a frame fails, in which case the first
        dimension returned may actually be less than num_frames.

        Returns None if an exception is thrown for any reason, or if no
        frames were read.
        """
        assert len(frame_idxs) > 0
        capture = cv2.VideoCapture(path)
        try:
            return self._read_frames_at_indices(path, capture, frame_idxs)
        finally:
            capture.release()

    def _read_frames_at_indices(self, path, capture, frame_idxs):
        """Streams through ``capture`` and decodes only the requested frames.

        ``capture`` is expected to be positioned at the start of the video
        (every caller in this class passes a freshly opened capture).
        Frames are grabbed sequentially from frame 0 so that the indices
        returned really are the positions of the decoded frames; frames
        that were not requested are grabbed but not decoded.

        Returns (stacked frames, list of indices read), or None when nothing
        could be read or an exception occurred.
        """
        try:
            # Sort and de-duplicate: the matching loop below relies on
            # strictly increasing indices, and a repeated index would
            # otherwise block every later frame from being read.
            wanted = sorted({int(idx) for idx in frame_idxs if idx >= 0})

            frames = []
            idxs_read = []
            last_idx = wanted[-1] if wanted else -1
            for frame_idx in range(last_idx + 1):
                # Get the next frame, but don't decode if we're not using it.
                ret = capture.grab()
                if not ret:
                    if self.verbose:
                        print("Error grabbing frame %d from movie %s" % (frame_idx, path))
                    break

                # Need to look at this frame?
                if frame_idx == wanted[len(idxs_read)]:
                    ret, frame = capture.retrieve()
                    if not ret or frame is None:
                        if self.verbose:
                            print("Error retrieving frame %d from movie %s" % (frame_idx, path))
                        break

                    frames.append(self._postprocess_frame(frame))
                    idxs_read.append(frame_idx)

            if len(frames) > 0:
                return np.stack(frames), idxs_read
            if self.verbose:
                print("No frames read from movie %s" % path)
            return None
        except Exception:
            # Best-effort reader: report and signal failure with None, but
            # let KeyboardInterrupt / SystemExit propagate.
            if self.verbose:
                print("Exception while reading movie %s" % path)
            return None

    def read_middle_frame(self, path):
        """Reads the frame from the middle of the video."""
        capture = cv2.VideoCapture(path)
        try:
            frame_count = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))
            return self._read_frame_at_index(path, capture, frame_count // 2)
        finally:
            capture.release()

    def read_frame_at_index(self, path, frame_idx):
        """Reads a single frame from a video.

        If you just want to read a single frame from the video, this is more
        efficient than scanning through the video to find the frame. However,
        for reading multiple frames it's not efficient.

        My guess is that a "streaming" approach is more efficient than a
        "random access" approach because, unless you happen to grab a keyframe,
        the decoder still needs to read all the previous frames in order to
        reconstruct the one you're asking for.

        Returns a NumPy array of shape (1, H, W, 3) and the index of the frame,
        or None if reading failed.
        """
        capture = cv2.VideoCapture(path)
        try:
            return self._read_frame_at_index(path, capture, frame_idx)
        finally:
            capture.release()

    def _read_frame_at_index(self, path, capture, frame_idx):
        """Seeks ``capture`` to ``frame_idx`` and decodes that single frame."""
        capture.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
        ret, frame = capture.read()
        if not ret or frame is None:
            if self.verbose:
                print("Error retrieving frame %d from movie %s" % (frame_idx, path))
            return None

        frame = self._postprocess_frame(frame)
        return np.expand_dims(frame, axis=0), [frame_idx]

    def _postprocess_frame(self, frame):
        """Converts a decoded BGR frame to RGB and applies the insets."""
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        return self._apply_insets(frame)

    def _apply_insets(self, frame):
        """Crops ``insets`` (fractions of width, height) off both sides."""
        if self.insets[0] > 0:
            W = frame.shape[1]
            p = int(W * self.insets[0])
            # A zero-pixel inset must be skipped: frame[:, 0:-0] is empty.
            if p > 0:
                frame = frame[:, p:-p, :]

        if self.insets[1] > 0:
            # Height is axis 0 (the previous code read the width here).
            H = frame.shape[0]
            q = int(H * self.insets[1])
            if q > 0:
                frame = frame[q:-q, :, :]

        return frame
|
| 185 |
+
|
| 186 |
+
|
| 187 |
+
class VideoReaderIspl(VideoReader):
    """
    Derived VideoReader class that adds read_frames_with_hop(), which samples
    frames at a fixed hop derived from a target frame rate.
    """

    def read_frames_with_hop(self, path: str, num_frames: int = -1, fps: int = -1):
        """Reads frames up to a certain number spaced throughout the video with a rate decided by the user.

        Arguments:
            path: the video file
            num_frames: how many frames to read, -1 means the entire video
                (warning: this will take up a lot of memory!)
            fps: how many frames per second to pick; -1 (or any
                non-positive value) picks every frame

        Returns the (frames, indices) pair produced by
        _read_frames_at_indices(), or None if the video reports no frames
        or reading failed.
        """
        # -1 is the documented "entire video" value and the default, so it
        # must pass validation alongside positive counts.
        assert num_frames > 0 or num_frames == -1

        capture = cv2.VideoCapture(path)
        try:
            frame_count = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))
            if frame_count <= 0:
                return None

            video_rate = capture.get(cv2.CAP_PROP_FPS)
            # Keep the hop an integer of at least 1 so the indices are valid.
            hop = 1 if fps <= 0 else max(int(video_rate // fps), 1)
            end_pts = frame_count if num_frames == -1 else min(num_frames * hop, frame_count)
            # np.arange has no `endpoint` argument; a half-open range up to
            # end_pts yields 0, hop, 2*hop, ... below end_pts.
            frame_idxs = np.arange(0, end_pts, hop, dtype=int)

            return self._read_frames_at_indices(path, capture, frame_idxs)
        finally:
            # Release the capture on every path, including early returns.
            capture.release()
|
models/icpr2020dfdc/environment.yml
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: icpr2020
|
| 2 |
+
channels:
|
| 3 |
+
- pytorch
|
| 4 |
+
- conda-forge
|
| 5 |
+
- defaults
|
| 6 |
+
dependencies:
|
| 7 |
+
- av=6.2.0
|
| 8 |
+
- albumentations
|
| 9 |
+
- cudatoolkit
|
| 10 |
+
- ffmpeg
|
| 11 |
+
- jupyter
|
| 12 |
+
- numpy
|
| 13 |
+
- opencv=3.4.2
|
| 14 |
+
- py-opencv=3.4.2
|
| 15 |
+
- python=3.6.9
|
| 16 |
+
- pip
|
| 17 |
+
- pytorch=1.4.0
|
| 18 |
+
- torchvision
|
| 19 |
+
- tqdm
|
| 20 |
+
- pandas
|
| 21 |
+
- pip:
|
| 22 |
+
- tensorboardx==2.0
|
| 23 |
+
- efficientnet-pytorch
|
| 24 |
+
- scikit-learn
|
| 25 |
+
|
models/icpr2020dfdc/extract_faces.py
ADDED
|
@@ -0,0 +1,346 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Extract faces
|
| 3 |
+
|
| 4 |
+
Video Face Manipulation Detection Through Ensemble of CNNs
|
| 5 |
+
|
| 6 |
+
Image and Sound Processing Lab - Politecnico di Milano
|
| 7 |
+
|
| 8 |
+
Nicolò Bonettini
|
| 9 |
+
Edoardo Daniele Cannas
|
| 10 |
+
Sara Mandelli
|
| 11 |
+
Luca Bondi
|
| 12 |
+
Paolo Bestagini
|
| 13 |
+
"""
|
| 14 |
+
import argparse
|
| 15 |
+
import sys
|
| 16 |
+
import traceback
|
| 17 |
+
from concurrent.futures import ThreadPoolExecutor
|
| 18 |
+
from functools import partial
|
| 19 |
+
from pathlib import Path
|
| 20 |
+
from typing import Tuple, List
|
| 21 |
+
|
| 22 |
+
import numpy as np
|
| 23 |
+
import pandas as pd
|
| 24 |
+
import torch
|
| 25 |
+
import torch.cuda
|
| 26 |
+
from PIL import Image
|
| 27 |
+
from tqdm import tqdm
|
| 28 |
+
|
| 29 |
+
import blazeface
|
| 30 |
+
from blazeface import BlazeFace, VideoReader, FaceExtractor
|
| 31 |
+
from isplutils.utils import adapt_bb
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def parse_args(argv):
    """Parse the command-line options of the face extraction script.

    Arguments:
        argv: list of argument strings (without the program name)

    Returns the populated argparse namespace. Note that ``--noindex`` is a
    ``store_false`` flag: the ``noindex`` attribute is True unless the flag
    is given.
    """
    parser = argparse.ArgumentParser()
    add = parser.add_argument

    # Mandatory filesystem locations.
    for flag, description in (
            ('--source', 'Videos root directory'),
            ('--videodf', 'Path to read the videos DataFrame'),
            ('--facesfolder', 'Faces output root directory'),
            ('--facesdf', 'Path to save the output DataFrame of faces'),
            ('--checkpoint', 'Path to save the temporary per-video outputs'),
    ):
        add(flag, type=Path, help=description, required=True)

    default_device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    add('--fpv', type=int, default=32, help='Frames per video')
    add('--device', type=torch.device, default=default_device, help='Device to use for face extraction')
    add('--collateonly', help='Only perform collation of pre-existing results', action='store_true')
    add('--noindex', help='Do not rebuild the index', action='store_false')

    # Integer tuning knobs.
    for flag, description, fallback in (
            ('--batch', 'Batch size', 16),
            ('--threads', 'Number of threads', 8),
            ('--offset', 'Offset to start extraction', 0),
            ('--num', 'Number of videos to process', 0),
    ):
        add(flag, type=int, help=description, default=fallback)

    add('--lazycheck', action='store_true', help='Lazy check of existing video indexes')
    add('--deepcheck', action='store_true', help='Try to open every image')

    return parser.parse_args(argv)
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def main(argv):
    """Extract faces from the indexed videos and collate them into a faces DataFrame.

    The function runs in two stages:

    1. Unless ``--collateonly`` is given, every selected video is handed to
       ``process_video`` through a thread pool; the returned face images are
       written with ``save_jpg`` and a per-video checkpoint DataFrame is
       pickled.
    2. Unless ``--noindex`` is given, all existing per-video checkpoints are
       read back, the videos DataFrame gets an ``nfaces`` column and is
       re-saved in place, and the concatenated faces DataFrame is written
       to ``--facesdf``.

    Arguments:
        argv: list of command-line argument strings, forwarded to parse_args

    Raises:
        ValueError: if collation finds no readable checkpoint at all.
    """
    args = parse_args(argv)

    ## Parameters parsing
    device: torch.device = args.device
    source_dir: Path = args.source
    facedestination_dir: Path = args.facesfolder
    frames_per_video: int = args.fpv
    videodataset_path: Path = args.videodf
    facesdataset_path: Path = args.facesdf
    collateonly: bool = args.collateonly
    batch_size: int = args.batch
    threads: int = args.threads
    offset: int = args.offset
    num: int = args.num
    lazycheck: bool = args.lazycheck
    deepcheck: bool = args.deepcheck
    checkpoint_folder: Path = args.checkpoint
    # `--noindex` is a store_false flag, so this is True unless it was passed.
    index_enable: bool = args.noindex

    ## Parameters
    face_size = 512

    print('Loading video DataFrame')
    # NOTE(review): read_pickle can execute arbitrary code from the file;
    # only point --videodf at trusted data.
    df_videos = pd.read_pickle(videodataset_path)

    # Select the slice of videos to process: `num` rows starting at
    # `offset`, or everything from `offset` onwards when num is 0.
    if num > 0:
        df_videos_process = df_videos.iloc[offset:offset + num]
    else:
        df_videos_process = df_videos.iloc[offset:]

    if not collateonly:

        ## Blazeface loading
        print('Loading face extractor')
        facedet = BlazeFace().to(device)
        # Relative paths: resolved against the current working directory.
        facedet.load_weights("blazeface/blazeface.pth")
        facedet.load_anchors("blazeface/anchors.npy")
        videoreader = VideoReader(verbose=False)
        video_read_fn = lambda x: videoreader.read_frames(x, num_frames=frames_per_video)
        face_extractor = FaceExtractor(video_read_fn, facedet)

        ## Face extraction
        with ThreadPoolExecutor(threads) as p:
            # Videos are processed in batches of `batch_size`; each batch is
            # mapped over the pool and fully collected before saving.
            for batch_idx0 in tqdm(np.arange(start=0, stop=len(df_videos_process), step=batch_size),
                                   desc='Extracting faces'):
                tosave_list = list(p.map(partial(process_video,
                                                 source_dir=source_dir,
                                                 facedestination_dir=facedestination_dir,
                                                 checkpoint_folder=checkpoint_folder,
                                                 face_size=face_size,
                                                 face_extractor=face_extractor,
                                                 lazycheck=lazycheck,
                                                 deepcheck=deepcheck,
                                                 ),
                                         df_videos_process.iloc[batch_idx0:batch_idx0 + batch_size].iterrows()))

                # Each non-None result is indexed as
                # (faces DataFrame, checkpoint path, list of (image, path)).
                for tosave in tosave_list:
                    if tosave is not None:
                        if len(tosave[2]):
                            # Images are written before the checkpoint.
                            list(p.map(save_jpg, tosave[2]))
                        tosave[1].parent.mkdir(parents=True, exist_ok=True)
                        tosave[0].to_pickle(str(tosave[1]))

    if index_enable:
        # Collect checkpoints
        df_videos['nfaces'] = np.zeros(len(df_videos), np.uint8)
        faces_dataset = []
        for idx, record in tqdm(df_videos.iterrows(), total=len(df_videos), desc='Collecting faces results'):
            # Checkpoint
            video_face_checkpoint_path = checkpoint_folder.joinpath(record['path']).with_suffix('.faces.pkl')
            if video_face_checkpoint_path.exists():
                try:
                    df_video_faces = pd.read_pickle(str(video_face_checkpoint_path))
                    # Fix same attribute issue
                    df_video_faces = df_video_faces.rename(columns={'subject': 'videosubject'}, errors='ignore')
                    # Count distinct subject ids parsed from index entries
                    # of the form '..._subj<N>.jpg'.
                    nfaces = len(
                        np.unique(df_video_faces.index.map(lambda x: int(x.split('_subj')[1].split('.jpg')[0]))))
                    df_videos.loc[idx, 'nfaces'] = nfaces
                    faces_dataset.append(df_video_faces)
                except Exception as e:
                    # An unreadable checkpoint is reported and deleted.
                    print('Error while reading: {}'.format(video_face_checkpoint_path))
                    print(e)
                    video_face_checkpoint_path.unlink()

        if len(faces_dataset) == 0:
            raise ValueError(f'No checkpoint found from face extraction. '
                             f'Is the the source path {source_dir} correct for the videos in your dataframe?')

        # Save videos with updated faces
        print('Saving videos DataFrame to {}'.format(videodataset_path))
        df_videos.to_pickle(str(videodataset_path))

        # Derive the output file name: when only a sub-range of videos was
        # requested, the range is encoded in the file name; when the given
        # path is a directory, a default file name is placed inside it.
        if offset > 0:
            if num > 0:
                if facesdataset_path.is_dir():
                    facesdataset_path = facesdataset_path.joinpath(
                        'faces_df_from_video_{}_to_video_{}.pkl'.format(offset, num + offset))
                else:
                    facesdataset_path = facesdataset_path.parent.joinpath(
                        str(facesdataset_path.parts[-1]).split('.')[0] + '_from_video_{}_to_video_{}.pkl'.format(offset,
                                                                                                                 num + offset))
            else:
                if facesdataset_path.is_dir():
                    facesdataset_path = facesdataset_path.joinpath('faces_df_from_video_{}.pkl'.format(offset))
                else:
                    facesdataset_path = facesdataset_path.parent.joinpath(
                        str(facesdataset_path.parts[-1]).split('.')[0] + '_from_video_{}.pkl'.format(offset))
        elif num > 0:
            if facesdataset_path.is_dir():
                facesdataset_path = facesdataset_path.joinpath(
                    'faces_df_from_video_{}_to_video_{}.pkl'.format(0, num))
            else:
                facesdataset_path = facesdataset_path.parent.joinpath(
                    str(facesdataset_path.parts[-1]).split('.')[0] + '_from_video_{}_to_video_{}.pkl'.format(0, num))
        else:
            if facesdataset_path.is_dir():
                facesdataset_path = facesdataset_path.joinpath('faces_df.pkl')  # just a check if the path is a dir

        # Creates directory (if doesn't exist)
        facesdataset_path.parent.mkdir(parents=True, exist_ok=True)
        print('Saving faces DataFrame to {}'.format(facesdataset_path))
        df_faces = pd.concat(faces_dataset, axis=0, )
        df_faces['video'] = df_faces['video'].astype('category')
        # Cast keypoint and box columns to int16.
        for key in ['kp1x', 'kp1y', 'kp2x', 'kp2y', 'kp3x',
                    'kp3y', 'kp4x', 'kp4y', 'kp5x', 'kp5y', 'kp6x', 'kp6y', 'left',
                    'top', 'right', 'bottom', ]:
            df_faces[key] = df_faces[key].astype(np.int16)
        df_faces['videosubject'] = df_faces['videosubject'].astype(np.int8)
        # Eventually remove duplicates
        df_faces = df_faces.loc[~df_faces.index.duplicated(keep='first')]
        # Copy over whichever of these per-video columns exist in df_videos.
        fields_to_preserve_from_video = [i for i in
                                         ['folder', 'subject', 'scene', 'cluster', 'nfaces', 'test'] if
                                         i in df_videos]
        df_faces = pd.merge(df_faces, df_videos[fields_to_preserve_from_video], left_on='video',
                            right_index=True)
        df_faces.to_pickle(str(facesdataset_path))

    print('Completed!')
|
| 197 |
+
|
| 198 |
+
|
| 199 |
+
def save_jpg(args: Tuple[Image.Image, Path or str]):
    """Write one image to disk as a JPEG.

    Takes a single ``(image, path)`` tuple so it can be used directly with
    ``Executor.map``. The image is saved with quality 95 and 4:4:4 chroma
    subsampling.
    """
    picture, destination = args
    jpeg_options = {'quality': 95, 'subsampling': '4:4:4'}
    picture.save(destination, **jpeg_options)
|
| 202 |
+
|
| 203 |
+
|
| 204 |
+
def process_video(item: Tuple[pd.Index, pd.Series],
                  source_dir: Path,
                  facedestination_dir: Path,
                  checkpoint_folder: Path,
                  face_size: int,
                  face_extractor: FaceExtractor,
                  lazycheck: bool = False,
                  deepcheck: bool = False,
                  ) -> (pd.DataFrame, Path, List[Tuple[Image.Image, Path]]) or None:
    """
    Extract the faces of one video unless a usable checkpoint already exists.

    :param item: ``(index, record)`` pair taken from the videos DataFrame; the record must provide
        'path', 'label' and 'original' (and optionally 'class', 'source', 'quality')
    :param source_dir: root folder the record paths are relative to
    :param facedestination_dir: root folder where the face crops are meant to be stored
    :param checkpoint_folder: root folder of the per-video ``*.faces.pkl`` checkpoints
    :param face_size: side of the square region cropped around each detection
    :param face_extractor: extractor used to detect faces in the video frames
    :param lazycheck: if True, an existing checkpoint is trusted without any verification
    :param deepcheck: if True (and lazycheck is False), every face image listed in the checkpoint
        is also opened and checked to be a non-empty 3-dimensional array
    :return: ``(faces DataFrame, checkpoint path, images still to be written)`` when the video has
        been processed; ``None`` when a checkpoint exists, when no frame could be read or when the
        processing failed (the error is printed, not raised)
    """
    # Instantiate Index and Series
    idx, record = item

    # Checkpoint
    video_faces_checkpoint_path = checkpoint_folder.joinpath(record['path']).with_suffix('.faces.pkl')

    if not lazycheck:
        if video_faces_checkpoint_path.exists():
            try:
                df_video_faces = pd.read_pickle(str(video_faces_checkpoint_path))
                for _, r in df_video_faces.iterrows():
                    face_path = facedestination_dir.joinpath(r.name)
                    # Explicit checks instead of `assert`: asserts are removed under `python -O`,
                    # which would make a broken checkpoint look valid.
                    if not face_path.exists():
                        raise FileNotFoundError('Missing face image: {}'.format(face_path))
                    if deepcheck:
                        # Context manager so the file handle is released for every face checked
                        with Image.open(face_path) as img:
                            img_arr = np.asarray(img)
                        if img_arr.ndim != 3:
                            raise ValueError('Face image is not 3-dimensional: {}'.format(face_path))
                        if np.prod(img_arr.shape) <= 0:
                            raise ValueError('Face image is empty: {}'.format(face_path))
            except Exception as e:
                # Any failure invalidates the checkpoint, so that the video is processed again below
                print('Error while checking: {}'.format(video_faces_checkpoint_path))
                print(e)
                video_faces_checkpoint_path.unlink()

    if not (video_faces_checkpoint_path.exists()):

        try:

            video_face_dict_list = []

            # Load faces
            current_video_path = source_dir.joinpath(record['path'])
            if not current_video_path.exists():
                raise FileNotFoundError(f'Unable to find {current_video_path}.'
                                        f'Are you sure that {source_dir} is the correct source directory for the video '
                                        f'you indexed in the dataframe?')

            frames = face_extractor.process_video(current_video_path)

            if len(frames) == 0:
                return

            face_extractor.keep_only_best_face(frames)
            # Every detection is assigned to subject 0
            for frame in frames:
                frame['subjects'] = [0] * len(frame['detections'])

            # Extract and save faces, bounding boxes, keypoints
            images_to_save: List[Tuple[Image.Image, Path]] = []
            for frame in frames:
                if len(frame['detections']):
                    fullframe = Image.fromarray(frame['frame'])

                    # Preserve the only found face even if not a good one, otherwise preserve only clusters > -1
                    subjects = np.unique(frame['subjects'])
                    if len(subjects) > 1:
                        subjects = np.asarray([s for s in subjects if s > -1])

                    for face_idx, _ in enumerate(frame['faces']):
                        subj_id = frame['subjects'][face_idx]
                        if subj_id in subjects:  # Exclude outliers if other faces detected
                            face_path = facedestination_dir.joinpath(record['path'], 'fr{:03d}_subj{:1d}.jpg'.format(
                                frame['frame_idx'], subj_id))

                            face_dict = {'facepath': str(face_path.relative_to(facedestination_dir)), 'video': idx,
                                         'label': record['label'], 'videosubject': subj_id,
                                         'original': record['original']}
                            # add attributes for ff++
                            if 'class' in record.keys():
                                face_dict.update({'class': record['class']})
                            if 'source' in record.keys():
                                face_dict.update({'source': record['source']})
                            if 'quality' in record.keys():
                                face_dict.update({'quality': record['quality']})

                            for field_idx, key in enumerate(blazeface.BlazeFace.detection_keys):
                                face_dict[key] = frame['detections'][face_idx][field_idx]

                            cropping_bb = adapt_bb(frame_height=fullframe.height,
                                                   frame_width=fullframe.width,
                                                   bb_height=face_size,
                                                   bb_width=face_size,
                                                   left=face_dict['xmin'],
                                                   top=face_dict['ymin'],
                                                   right=face_dict['xmax'],
                                                   bottom=face_dict['ymax'])
                            face = fullframe.crop(cropping_bb)

                            # Express box and keypoint coordinates relative to the crop origin
                            for key in blazeface.BlazeFace.detection_keys:
                                if (key[0] == 'k' and key[-1] == 'x') or (key[0] == 'x'):
                                    face_dict[key] -= cropping_bb[0]
                                elif (key[0] == 'k' and key[-1] == 'y') or (key[0] == 'y'):
                                    face_dict[key] -= cropping_bb[1]

                            face_dict['left'] = face_dict.pop('xmin')
                            face_dict['top'] = face_dict.pop('ymin')
                            face_dict['right'] = face_dict.pop('xmax')
                            face_dict['bottom'] = face_dict.pop('ymax')

                            face_path.parent.mkdir(parents=True, exist_ok=True)
                            images_to_save.append((face, face_path))

                            video_face_dict_list.append(face_dict)

            if len(video_face_dict_list) > 0:

                df_video_faces = pd.DataFrame(video_face_dict_list)
                df_video_faces.index = df_video_faces['facepath']
                del df_video_faces['facepath']

                # type conversions
                for key in ['kp1x', 'kp1y', 'kp2x', 'kp2y', 'kp3x', 'kp3y',
                            'kp4x', 'kp4y', 'kp5x', 'kp5y', 'kp6x', 'kp6y', 'left', 'top',
                            'right', 'bottom']:
                    df_video_faces[key] = df_video_faces[key].astype(np.int16)
                df_video_faces['conf'] = df_video_faces['conf'].astype(np.float32)
                df_video_faces['video'] = df_video_faces['video'].astype('category')

                video_faces_checkpoint_path.parent.mkdir(parents=True, exist_ok=True)

            else:
                print('No faces extracted for video {}'.format(record['path']))
                df_video_faces = pd.DataFrame()

            return df_video_faces, video_faces_checkpoint_path, images_to_save

        except Exception as e:
            # Best effort: a failing video is reported and skipped, it must not stop the whole run
            print('Error while processing: {}'.format(record['path']))
            print("-" * 60)
            traceback.print_exc(file=sys.stdout, limit=5)
            print("-" * 60)
            return
|
| 343 |
+
|
| 344 |
+
|
| 345 |
+
if __name__ == '__main__':
|
| 346 |
+
main(sys.argv[1:])
|
models/icpr2020dfdc/index_celebdf.py
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Index Celeb-DF v2
|
| 3 |
+
Image and Sound Processing Lab - Politecnico di Milano
|
| 4 |
+
Nicolò Bonettini
|
| 5 |
+
Edoardo Daniele Cannas
|
| 6 |
+
Sara Mandelli
|
| 7 |
+
Luca Bondi
|
| 8 |
+
Paolo Bestagini
|
| 9 |
+
"""
|
| 10 |
+
import argparse
|
| 11 |
+
from multiprocessing import Pool
|
| 12 |
+
from pathlib import Path
|
| 13 |
+
|
| 14 |
+
import numpy as np
|
| 15 |
+
import pandas as pd
|
| 16 |
+
|
| 17 |
+
from isplutils.utils import extract_meta_av, extract_meta_cv
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def main():
    """
    Index the Celeb-DF v2 videos found under ``--source`` and save the result as a pickled DataFrame.

    If the file given by ``--videodataset`` already exists it is loaded instead of being rebuilt.
    In both cases the number of real and fake videos is printed.

    :raises FileNotFoundError: if "List_of_testing_videos.txt" is missing from the source directory
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--source', type=Path, help='Source dir',
                        required=True)
    parser.add_argument('--videodataset', type=Path, default='data/celebdf_videos.pkl',
                        help='Path to save the videos DataFrame')

    args = parser.parse_args()

    ## Parameters parsing
    source_dir: Path = args.source
    videodataset_path: Path = args.videodataset

    # Create output folder (if doesn't exist)
    videodataset_path.parent.mkdir(parents=True, exist_ok=True)

    ## DataFrame
    if videodataset_path.exists():
        print('Loading video DataFrame')
        df_videos = pd.read_pickle(videodataset_path)
    else:
        print('Creating video DataFrame')

        split_file = Path(source_dir).joinpath('List_of_testing_videos.txt')
        if not split_file.exists():
            raise FileNotFoundError('Unable to find "List_of_testing_videos.txt" in {}'.format(source_dir))
        # NOTE(review): header=0 consumes the first line of the list as a header row; confirm the
        # file really has one, otherwise the first test video is silently dropped from the split.
        test_videos_df = pd.read_csv(split_file, delimiter=' ', header=0, index_col=1)

        ff_videos = Path(source_dir).rglob('*.mp4')
        df_videos = pd.DataFrame(
            {'path': [f.relative_to(source_dir) for f in ff_videos]})

        df_videos['height'] = df_videos['width'] = df_videos['frames'] = np.zeros(len(df_videos), dtype=np.uint16)
        with Pool() as p:
            meta = p.map(extract_meta_av, df_videos['path'].map(lambda x: str(source_dir.joinpath(x))))
            meta = np.stack(meta)
            df_videos.loc[:, ['height', 'width', 'frames']] = meta

        # Fix for videos that av cannot decode properly
        for idx, record in df_videos[df_videos['frames'] == 0].iterrows():
            meta = extract_meta_cv(str(source_dir.joinpath(record['path'])))
            df_videos.loc[idx, ['height', 'width', 'frames']] = meta

        df_videos['class'] = df_videos['path'].map(lambda x: x.parts[0]).astype('category')
        df_videos['label'] = df_videos['class'].map(
            lambda x: True if x == 'Celeb-synthesis' else False)  # True is FAKE, False is REAL
        df_videos['name'] = df_videos['path'].map(lambda x: x.with_suffix('').name)

        # For every fake video, store the index of the real video it derives from (-1 for real videos).
        # The original name is built from the first and third '_'-separated tokens of the fake name.
        df_videos['original'] = -1 * np.ones(len(df_videos), dtype=np.int16)
        df_videos.loc[(df_videos['label'] == True), 'original'] = \
            df_videos[(df_videos['label'] == True)]['name'].map(
                lambda x: df_videos.index[
                    np.flatnonzero(df_videos['name'] == '_'.join([x.split('_')[0], x.split('_')[2]]))[0]]
            )

        # as_posix() always yields forward slashes; str() of a Windows path would use backslashes
        # and never match the '/'-separated entries of the split file.
        df_videos['test'] = df_videos['path'].map(lambda x: x.as_posix()).isin(test_videos_df.index)

        print('Saving video DataFrame to {}'.format(videodataset_path))
        df_videos.to_pickle(str(videodataset_path))

    print('Real videos: {:d}'.format(sum(df_videos['label'] == 0)))
    print('Fake videos: {:d}'.format(sum(df_videos['label'] == 1)))
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
if __name__ == '__main__':
|
| 85 |
+
main()
|
models/icpr2020dfdc/index_dfdc.py
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Index the official Kaggle training dataset and prepares a train and validation set based on folders
|
| 3 |
+
|
| 4 |
+
Video Face Manipulation Detection Through Ensemble of CNNs
|
| 5 |
+
|
| 6 |
+
Image and Sound Processing Lab - Politecnico di Milano
|
| 7 |
+
|
| 8 |
+
Nicolò Bonettini
|
| 9 |
+
Edoardo Daniele Cannas
|
| 10 |
+
Sara Mandelli
|
| 11 |
+
Luca Bondi
|
| 12 |
+
Paolo Bestagini
|
| 13 |
+
"""
|
| 14 |
+
import sys
|
| 15 |
+
import argparse
|
| 16 |
+
from multiprocessing import Pool
|
| 17 |
+
from pathlib import Path
|
| 18 |
+
|
| 19 |
+
import numpy as np
|
| 20 |
+
import pandas as pd
|
| 21 |
+
from tqdm import tqdm
|
| 22 |
+
|
| 23 |
+
from isplutils.utils import extract_meta_av
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def parse_args(argv):
    """
    Parse the command-line options of the DFDC indexing script.

    :param argv: list of argument strings (without the program name)
    :return: namespace with ``source`` (Path, required), ``videodataset`` (Path) and ``batch`` (int)
    """
    option_table = (
        ('--source', dict(type=Path, help='Source dir', required=True)),
        ('--videodataset', dict(type=Path, default='data/dfdc_videos.pkl',
                                help='Path to save the videos DataFrame')),
        ('--batch', dict(type=int, help='Batch size', default=64)),
    )

    cli = argparse.ArgumentParser()
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)

    return cli.parse_args(argv)
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def main(argv):
    """
    Index the DFDC training set from its ``metadata.json`` files and save a pickled videos DataFrame.

    If the target file already exists it is simply loaded. In both cases the number of real and
    fake videos is printed.

    :param argv: list of command-line argument strings (without the program name)
    """
    ## Parameters parsing
    cli = parse_args(argv)
    source_dir: Path = cli.source
    videodataset_path: Path = cli.videodataset
    batch_size: int = cli.batch

    ## DataFrame
    if not videodataset_path.exists():
        print('Creating video DataFrame')

        # Create output folder
        videodataset_path.parent.mkdir(parents=True, exist_ok=True)

        # Index: one partial DataFrame for each metadata.json found under the source directory
        partial_frames = []
        for json_path in tqdm(sorted(source_dir.rglob('metadata.json')), desc='Indexing'):
            rel_folder = json_path.parent.relative_to(source_dir)
            df_tmp = pd.read_json(json_path, orient='index')
            df_tmp['path'] = df_tmp.index.map(lambda name: str(rel_folder.joinpath(name)))
            # The folder number is the last '_'-separated token of the containing directory name
            df_tmp['folder'] = int(str(json_path.parts[-2]).split('_')[-1])
            partial_frames.append(df_tmp)
        df_videos = pd.concat(partial_frames, axis=0, verify_integrity=True)

        # Save space
        del df_videos['split']
        df_videos['label'] = df_videos['label'] == 'FAKE'
        df_videos['original'] = df_videos['original'].astype('category')
        df_videos['folder'] = df_videos['folder'].astype(np.uint8)

        # Collect metadata, one batch of videos at a time
        absolute_paths = np.asarray(df_videos.path.map(lambda rel: str(source_dir.joinpath(rel))))
        collected = []
        with Pool() as pool:
            for first in tqdm(np.arange(start=0, stop=len(df_videos), step=batch_size), desc='Metadata'):
                collected.extend(pool.map(extract_meta_av, absolute_paths[first:first + batch_size]))

        # Each metadata entry is read as (height, width, frames)
        df_videos['height'] = np.asarray([entry[0] for entry in collected], dtype=np.uint16)
        df_videos['width'] = np.asarray([entry[1] for entry in collected], dtype=np.uint16)
        df_videos['frames'] = np.asarray([entry[2] for entry in collected], dtype=np.uint16)

        print('Saving video DataFrame to {}'.format(videodataset_path))
        df_videos.to_pickle(str(videodataset_path))
    else:
        print('Loading video DataFrame')
        df_videos = pd.read_pickle(videodataset_path)

    print('Real videos: {:d}'.format(sum(df_videos['label'] == 0)))
    print('Fake videos: {:d}'.format(sum(df_videos['label'] == 1)))
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
if __name__ == '__main__':
|
| 94 |
+
main(sys.argv[1:])
|
models/icpr2020dfdc/index_ffpp.py
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Index FaceForensics++
|
| 3 |
+
|
| 4 |
+
Video Face Manipulation Detection Through Ensemble of CNNs
|
| 5 |
+
|
| 6 |
+
Image and Sound Processing Lab - Politecnico di Milano
|
| 7 |
+
|
| 8 |
+
Nicolò Bonettini
|
| 9 |
+
Edoardo Daniele Cannas
|
| 10 |
+
Sara Mandelli
|
| 11 |
+
Luca Bondi
|
| 12 |
+
Paolo Bestagini
|
| 13 |
+
"""
|
| 14 |
+
import argparse
|
| 15 |
+
import sys
|
| 16 |
+
from multiprocessing import Pool
|
| 17 |
+
from pathlib import Path
|
| 18 |
+
|
| 19 |
+
import numpy as np
|
| 20 |
+
import pandas as pd
|
| 21 |
+
|
| 22 |
+
from isplutils.utils import extract_meta_av, extract_meta_cv
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def parse_args(argv):
    """
    Parse the command-line options of the FaceForensics++ indexing script.

    :param argv: list of argument strings (without the program name)
    :return: namespace with ``source`` (Path) and ``videodataset`` (Path)
    """
    option_table = (
        ('--source', dict(type=Path, help='Source dir',
                          default='dataset/ffpp/faceforensics')),
        ('--videodataset', dict(type=Path, default='data/ffpp_videos.pkl',
                                help='Path to save the videos DataFrame')),
    )

    cli = argparse.ArgumentParser()
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)

    return cli.parse_args(argv)
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def main(argv):
    """
    Index the FaceForensics++ videos found under the source directory and save a pickled DataFrame.

    If the target file already exists it is simply loaded. In both cases the number of real and
    fake videos is printed.

    :param argv: list of command-line argument strings (without the program name)
    """
    ## Parameters parsing
    args = parse_args(argv)
    source_dir: Path = args.source
    videodataset_path: Path = args.videodataset

    # Create output folder (if doesn't exist)
    videodataset_path.parent.mkdir(parents=True, exist_ok=True)

    ## DataFrame
    if videodataset_path.exists():
        print('Loading video DataFrame')
        df_videos = pd.read_pickle(videodataset_path)
    else:
        print('Creating video DataFrame')

        # The 'mask'/'raw' exclusion is applied to the path RELATIVE to the source directory:
        # testing the full path would discard every video whenever the dataset root itself
        # contains one of these substrings.
        relative_videos = [f.relative_to(source_dir) for f in Path(source_dir).rglob('*.mp4')]
        df_videos = pd.DataFrame(
            {'path': [rel for rel in relative_videos if 'mask' not in str(rel) and 'raw' not in str(rel)]})

        df_videos['height'] = df_videos['width'] = df_videos['frames'] = np.zeros(len(df_videos), dtype=np.uint16)
        with Pool() as p:
            meta = p.map(extract_meta_av, df_videos['path'].map(lambda x: str(source_dir.joinpath(x))))
            meta = np.stack(meta)
            df_videos.loc[:, ['height', 'width', 'frames']] = meta

        # Fix for videos that av cannot decode properly
        for idx, record in df_videos[df_videos['frames'] == 0].iterrows():
            meta = extract_meta_cv(str(source_dir.joinpath(record['path'])))
            df_videos.loc[idx, ['height', 'width', 'frames']] = meta

        # Class, source and quality are the first three components of the relative path
        df_videos['class'] = df_videos['path'].map(lambda x: x.parts[0]).astype('category')
        df_videos['label'] = df_videos['class'].map(
            lambda x: True if x == 'manipulated_sequences' else False)  # True is FAKE, False is REAL
        df_videos['source'] = df_videos['path'].map(lambda x: x.parts[1]).astype('category')
        df_videos['quality'] = df_videos['path'].map(lambda x: x.parts[2]).astype('category')
        df_videos['name'] = df_videos['path'].map(lambda x: x.with_suffix('').parts[-1])

        # For every fake video, store the index of the video it derives from (-1 for real videos).
        df_videos['original'] = -1 * np.ones(len(df_videos), dtype=np.int16)
        # Sources other than DeepFakeDetection: the original name is the first '_'-separated token
        df_videos.loc[(df_videos['label'] == True) & (df_videos['source'] != 'DeepFakeDetection'), 'original'] = \
            df_videos[(df_videos['label'] == True) & (df_videos['source'] != 'DeepFakeDetection')]['name'].map(
                lambda x: df_videos.index[np.flatnonzero(df_videos['name'] == x.split('_')[0])[0]]
            )
        # DeepFakeDetection: the original name is '<first "_" token>__<second "__" token>'
        df_videos.loc[(df_videos['label'] == True) & (df_videos['source'] == 'DeepFakeDetection'), 'original'] = \
            df_videos[(df_videos['label'] == True) & (df_videos['source'] == 'DeepFakeDetection')]['name'].map(
                lambda x: df_videos.index[
                    np.flatnonzero(df_videos['name'] == x.split('_')[0] + '__' + x.split('__')[1])[0]]
            )

        print('Saving video DataFrame to {}'.format(videodataset_path))
        df_videos.to_pickle(str(videodataset_path))

    print('Real videos: {:d}'.format(sum(df_videos['label'] == 0)))
    print('Fake videos: {:d}'.format(sum(df_videos['label'] == 1)))
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
if __name__ == '__main__':
|
| 92 |
+
main(sys.argv[1:])
|
models/icpr2020dfdc/isplutils/__init__.py
ADDED
|
File without changes
|
models/icpr2020dfdc/isplutils/data.py
ADDED
|
@@ -0,0 +1,263 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Video Face Manipulation Detection Through Ensemble of CNNs
|
| 3 |
+
|
| 4 |
+
Image and Sound Processing Lab - Politecnico di Milano
|
| 5 |
+
|
| 6 |
+
Nicolò Bonettini
|
| 7 |
+
Edoardo Daniele Cannas
|
| 8 |
+
Sara Mandelli
|
| 9 |
+
Luca Bondi
|
| 10 |
+
Paolo Bestagini
|
| 11 |
+
"""
|
| 12 |
+
import os
|
| 13 |
+
from pathlib import Path
|
| 14 |
+
from typing import List
|
| 15 |
+
|
| 16 |
+
import albumentations as A
|
| 17 |
+
import numpy as np
|
| 18 |
+
import pandas as pd
|
| 19 |
+
import torch
|
| 20 |
+
from PIL import Image
|
| 21 |
+
from albumentations.pytorch import ToTensorV2
|
| 22 |
+
from torch.utils.data import Dataset, IterableDataset
|
| 23 |
+
|
| 24 |
+
from .utils import extract_bb
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def load_face(record: pd.Series, root: str, size: int, scale: str, transformer: A.BasicTransform) -> torch.Tensor:
    """
    Load the face described by a DataFrame record, using an on-disk cache when available.

    :param record: row of the faces DataFrame; its name is the image path relative to ``root`` and it
        must provide the 'left', 'top', 'right' and 'bottom' bounding box fields
    :param root: root folder of the images; the cache lives in its 'autocache' subfolder
    :param size: face size, forwarded to ``extract_bb``
    :param scale: extraction mode, forwarded to ``extract_bb``; for 'crop' and 'scale' the cache path
        also depends on ``size``
    :param transformer: callable applied as ``transformer(image=face)['image']`` to the loaded array
    :return: the transformed face; an all-zero image is transformed instead if the image cannot be read
    :raises RuntimeError: if the loaded image is not a 3-dimensional array
    """
    path = os.path.join(str(root), str(record.name))
    # Extracted faces are written to the cache only in these configurations
    autocache = size < 256 or scale == 'tight'
    if scale in ['crop', 'scale', ]:
        cached_path = str(Path(root).joinpath('autocache', scale, str(size), str(record.name)).with_suffix('.jpg'))
    else:
        # when scale == 'tight' the extracted face is not dependent on size
        cached_path = str(Path(root).joinpath('autocache', scale, str(record.name)).with_suffix('.jpg'))

    # KeyboardInterrupt is not an OSError, so it propagates through the handlers below untouched.
    face = np.zeros((size, size, 3), dtype=np.uint8)
    if os.path.exists(cached_path):
        try:
            # Context manager so the file handle is released as soon as the pixels are copied
            with Image.open(cached_path) as cached_image:
                face = np.array(cached_image)
            if len(face.shape) != 3:
                # Report the file that was actually read, i.e. the cached one
                raise RuntimeError('Incorrect format: {}'.format(cached_path))
        except OSError as e:
            print('Deleting corrupted cache file: {}'.format(cached_path))
            print(e)
            os.unlink(cached_path)
            face = np.zeros((size, size, 3), dtype=np.uint8)

    # Reached when there was no cache file or when the corrupted one has just been deleted
    if not os.path.exists(cached_path):
        try:
            with Image.open(path) as frame:
                bb = record['left'], record['top'], record['right'], record['bottom']
                face = extract_bb(frame, bb=bb, size=size, scale=scale)

                if autocache:
                    os.makedirs(os.path.dirname(cached_path), exist_ok=True)
                    face.save(cached_path, quality=95, subsampling='4:4:4')

                # Converted while the source image is still open, in case the crop depends on it
                face = np.array(face)
            if len(face.shape) != 3:
                raise RuntimeError('Incorrect format: {}'.format(path))
        except OSError as e:
            print('Error while reading: {}'.format(path))
            print(e)
            face = np.zeros((size, size, 3), dtype=np.uint8)

    face = transformer(image=face)['image']

    return face
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
class FrameFaceIterableDataset(IterableDataset):
    """
    Iterable dataset that alternately yields one fake and one real face sample.
    """

    def __init__(self,
                 roots: List[str],
                 dfs: List[pd.DataFrame],
                 size: int, scale: str,
                 num_samples: int = -1,
                 transformer: A.BasicTransform = ToTensorV2(),
                 output_index: bool = False,
                 labels_map: dict = None,
                 seed: int = None):
        """

        :param roots: list of root folders, one for each DataFrame in ``dfs``
        :param dfs: list of faces DataFrames; each needs a 'label' column and whatever ``load_face``
                    reads from a record
        :param size: face size
        :param scale: face extraction mode, forwarded as is to ``load_face``
        :param num_samples: upper bound on the number of samples; values <= 0 mean no bound
        :param transformer: transformation applied to every loaded face
        :param output_index: if True, the record name is yielded together with face and label
        :param labels_map: map from the DataFrame labels to the yielded labels
        :param seed: base seed for the sampling; drawn at random when None
        """

        self.dfs = dfs
        self.size = int(size)

        if seed is None:
            self.seed0 = np.random.choice(2 ** 32)
        else:
            self.seed0 = int(seed)

        # Work on copies whose index is (position of the DataFrame, original key),
        # so that rows stay traceable after the concatenation
        reindexed = []
        for source_pos, table in enumerate(self.dfs):
            clone = table.copy()
            clone.index = pd.MultiIndex.from_tuples(
                [(source_pos, key) for key in clone.index], names=['df_idx', 'df_key'])
            reindexed.append(clone)
        self.df = pd.concat(reindexed, axis=0, join='inner')

        self.df_real = self.df[self.df['label'] == 0]
        self.df_fake = self.df[self.df['label'] == 1]

        n_real, n_fake = len(self.df_real), len(self.df_fake)
        self.longer_set = 'real' if n_real > n_fake else 'fake'
        # Twice the larger class, optionally capped by the requested number of samples
        balanced_total = max(n_real, n_fake) * 2
        self.num_samples = min(balanced_total, num_samples) if num_samples > 0 else balanced_total

        self.output_idx = bool(output_index)

        self.scale = str(scale)
        self.roots = [str(r) for r in roots]
        self.transformer = transformer

        if labels_map is None:
            self.labels_map = {False: np.array([0., ]), True: np.array([1., ])}
        else:
            self.labels_map = dict(labels_map)

    def _get_face(self, item: pd.Index) -> (torch.Tensor, torch.Tensor) or (torch.Tensor, torch.Tensor, str):
        """
        Load the sample addressed by ``item``, a (DataFrame position, key) pair.

        :return: (face, label), plus the record name when index output is enabled
        """
        source_pos, key = item[0], item[1]
        record = self.dfs[source_pos].loc[key]
        face = load_face(record=record,
                         root=self.roots[source_pos],
                         size=self.size,
                         scale=self.scale,
                         transformer=self.transformer)

        label = self.labels_map[record.label]
        if not self.output_idx:
            return face, label
        return face, label, record.name

    def __len__(self):
        return self.num_samples

    def __iter__(self):
        """Yield samples in fake/real pairs until either list of indexes is exhausted."""
        fake_queue, real_queue = get_iterative_real_fake_idxs(
            df_real=self.df_real,
            df_fake=self.df_fake,
            num_samples=self.num_samples,
            seed0=self.seed0
        )

        while fake_queue and real_queue:
            yield self._get_face(fake_queue.pop())
            yield self._get_face(real_queue.pop())
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
def get_iterative_real_fake_idxs(df_real: pd.DataFrame, df_fake: pd.DataFrame,
                                 num_samples: int, seed0: int):
    """
    Build two shuffled, equally long lists of fake and real indexes for one iteration.

    Outside a DataLoader worker, ``num_samples // 2`` indexes are drawn for each class; the shorter
    class is sampled with replacement. Inside a worker, each worker takes its own contiguous slice
    of the longer class and draws the other class with replacement.

    Note: the global NumPy random state is re-seeded by this function.

    :param df_real: DataFrame of the real samples
    :param df_fake: DataFrame of the fake samples
    :param num_samples: total number of samples (fake + real) for the whole iteration
    :param seed0: base seed; the worker id is added to it inside DataLoader workers
    :return: (list of fake indexes, list of real indexes)
    """
    longer_set = 'real' if len(df_real) > len(df_fake) else 'fake'
    worker_info = torch.utils.data.get_worker_info()
    if worker_info is None:
        seed = seed0
        np.random.seed(seed)
        worker_num_couple_samples = num_samples // 2
        fake_idxs_portion = np.random.choice(df_fake.index, worker_num_couple_samples,
                                             replace=longer_set == 'real')
        real_idxs_portion = np.random.choice(df_real.index, worker_num_couple_samples,
                                             replace=longer_set == 'fake')
    else:
        worker_id = worker_info.id
        # np.random.seed only accepts values in [0, 2**32 - 1]: wrap around so that a base seed
        # close to the upper bound does not make the workers fail with a ValueError.
        seed = (seed0 + worker_id) % (2 ** 32)
        np.random.seed(seed)
        worker_num_couple_samples = (num_samples // 2) // worker_info.num_workers
        if longer_set == 'fake':
            fake_idxs_portion = df_fake.index[
                                worker_id * worker_num_couple_samples:(worker_id + 1) * worker_num_couple_samples]
            real_idxs_portion = np.random.choice(df_real.index, worker_num_couple_samples, replace=True)
        else:
            real_idxs_portion = df_real.index[
                                worker_id * worker_num_couple_samples:(worker_id + 1) * worker_num_couple_samples]
            fake_idxs_portion = np.random.choice(df_fake.index, worker_num_couple_samples,
                                                 replace=True)

    random_fake_idxs = list(np.random.permutation(fake_idxs_portion))
    random_real_idxs = list(np.random.permutation(real_idxs_portion))

    assert (len(random_fake_idxs) == len(random_real_idxs))

    return random_fake_idxs, random_real_idxs
|
| 199 |
+
|
| 200 |
+
|
| 201 |
+
class FrameFaceDatasetTest(Dataset):
    """
    Map-style dataset returning the faces of a DataFrame in order, optionally with several
    augmented versions of each face.
    """

    def __init__(self, root: str, df: pd.DataFrame,
                 size: int, scale: str,
                 transformer: A.BasicTransform = ToTensorV2(),
                 labels_map: dict = None,
                 aug_transformers: List[A.BasicTransform] = None):
        """

        :param root: root folder of the face images
        :param df: faces DataFrame; it needs a 'label' column and whatever ``load_face`` reads
                   from a record
        :param size: face size
        :param scale: face extraction mode, forwarded as is to ``load_face``
        :param transformer: transformation applied to every loaded face
        :param labels_map: dict to map df labels
        :param aug_transformers: if not None, every sample is returned as a stack of faces, one for
                                 each provided augmentation (each composed with ``transformer``)
        """

        self.df = df
        self.size = int(size)

        self.scale = str(scale)
        self.root = str(root)
        self.transformer = transformer
        self.aug_transformers = aug_transformers

        if labels_map is None:
            self.labels_map = {False: np.array([0., ]), True: np.array([1., ])}
        else:
            self.labels_map = dict(labels_map)

    def _get_face(self, item: pd.Index) -> (torch.Tensor, torch.Tensor) or (torch.Tensor, torch.Tensor, str):
        """
        Load the sample stored under DataFrame key ``item``.

        :return: (face, label), or (stack of augmented faces, label) when augmentations are set
        """
        record = self.df.loc[item]
        label = self.labels_map[record.label]

        if self.aug_transformers is not None:
            # One load per augmentation, each followed by the common transformer
            augmented = [
                load_face(record=record,
                          root=self.root,
                          size=self.size,
                          scale=self.scale,
                          transformer=A.Compose([augmentation, self.transformer]))
                for augmentation in self.aug_transformers
            ]
            return torch.stack(augmented), label

        single_face = load_face(record=record,
                                root=self.root,
                                size=self.size,
                                scale=self.scale,
                                transformer=self.transformer)
        return single_face, label

    def __len__(self):
        return len(self.df)

    def __getitem__(self, item):
        # Positional access: translate the position into the corresponding DataFrame key
        return self._get_face(self.df.index[item])
|
models/icpr2020dfdc/isplutils/data_siamese.py
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Video Face Manipulation Detection Through Ensemble of CNNs
|
| 3 |
+
|
| 4 |
+
Image and Sound Processing Lab - Politecnico di Milano
|
| 5 |
+
|
| 6 |
+
Nicolò Bonettini
|
| 7 |
+
Edoardo Daniele Cannas
|
| 8 |
+
Sara Mandelli
|
| 9 |
+
Luca Bondi
|
| 10 |
+
Paolo Bestagini
|
| 11 |
+
"""
|
| 12 |
+
from typing import List
|
| 13 |
+
|
| 14 |
+
import albumentations as A
|
| 15 |
+
import pandas as pd
|
| 16 |
+
from albumentations.pytorch import ToTensorV2
|
| 17 |
+
|
| 18 |
+
from .data import FrameFaceIterableDataset, get_iterative_real_fake_idxs
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class FrameFaceTripletIterableDataset(FrameFaceIterableDataset):
    """
    Iterable dataset yielding (anchor, positive, negative) face triplets.

    Triplets are produced in couples: first one with fake anchor/positive and real negative,
    then one with real anchor/positive and fake negative.
    """

    def __init__(self,
                 roots: List[str],
                 dfs: List[pd.DataFrame],
                 size: int,
                 scale: str,
                 num_triplets: int = -1,
                 transformer: A.BasicTransform = ToTensorV2(),
                 seed: int = None):
        """

        :param roots: List of root folders for frames cache
        :param dfs: List of DataFrames of cached frames with 'bb' column as array of 4 elements (left,top,right,bottom)
                    and 'label' column
        :param size: face size
        :param scale: face extraction policy, forwarded unchanged to the parent dataset
        :param num_triplets: number of triplets for the dataset (three samples are requested per triplet)
        :param transformer: transformation forwarded to the parent dataset
        :param seed: seed forwarded to the parent dataset
        """
        super().__init__(
            roots=roots,
            dfs=dfs,
            size=size,
            scale=scale,
            num_samples=num_triplets * 3,
            transformer=transformer,
            seed=seed,
        )

        # Round down to whole couples of triplets (6 samples each), then derive the other counters
        self.num_triplet_couples = self.num_samples // 6
        self.num_triplets = self.num_triplet_couples * 2
        self.num_samples = self.num_triplets * 3

    def __len__(self):
        return self.num_triplets

    def __iter__(self):
        fake_pool, real_pool = get_iterative_real_fake_idxs(
            df_real=self.df_real,
            df_fake=self.df_fake,
            num_samples=self.num_samples,
            seed0=self.seed0,
        )

        def next_face(pool):
            # Consume the last index of the pool and keep only the face, dropping the label
            return self._get_face(pool.pop())[0]

        # Each pass consumes three indexes from each pool
        while min(len(fake_pool), len(real_pool)) >= 3:
            # fake anchor, fake positive, real negative
            yield next_face(fake_pool), next_face(fake_pool), next_face(real_pool)
            # real anchor, real positive, fake negative
            yield next_face(real_pool), next_face(real_pool), next_face(fake_pool)
|
models/icpr2020dfdc/isplutils/split.py
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List, Dict, Tuple
|
| 2 |
+
"""
|
| 3 |
+
Video Face Manipulation Detection Through Ensemble of CNNs
|
| 4 |
+
|
| 5 |
+
Image and Sound Processing Lab - Politecnico di Milano
|
| 6 |
+
|
| 7 |
+
Nicolò Bonettini
|
| 8 |
+
Edoardo Daniele Cannas
|
| 9 |
+
Sara Mandelli
|
| 10 |
+
Luca Bondi
|
| 11 |
+
Paolo Bestagini
|
| 12 |
+
"""
|
| 13 |
+
import numpy as np
|
| 14 |
+
import pandas as pd
|
| 15 |
+
|
| 16 |
+
# Dataset identifiers known to this module.
# The 'ff-...-<N>fpv' variants keep only N frames per video.
available_datasets = (
    ['dfdc-35-5-10', 'ff-c23-720-140-140']
    + ['ff-c23-720-140-140-{}fpv'.format(fpv) for fpv in (5, 10, 15, 20, 25)]
    + ['celebdf']  # just for convenience, not used in the original paper
)
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def load_df(dfdc_df_path: str, ffpp_df_path: str, dfdc_faces_dir: str, ffpp_faces_dir: str, dataset: str) -> (pd.DataFrame, str):
    """
    Load the pickled faces DataFrame matching a dataset name, together with its faces root.

    :param dfdc_df_path: path to the pickled DataFrame used for datasets starting with 'dfdc'
    :param ffpp_df_path: path to the pickled DataFrame used for datasets starting with 'ff-'
    :param dfdc_faces_dir: root returned for datasets starting with 'dfdc'
    :param ffpp_faces_dir: root returned for datasets starting with 'ff-'
    :param dataset: dataset name
    :return: (DataFrame, faces root)
    :raises NotImplementedError: if the dataset name matches neither prefix
    """
    # (name prefix, DataFrame path, faces root), checked in this order
    sources = (
        ('dfdc', dfdc_df_path, dfdc_faces_dir),
        ('ff-', ffpp_df_path, ffpp_faces_dir),
    )
    for prefix, df_path, faces_dir in sources:
        if dataset.startswith(prefix):
            # NOTE: read_pickle unpickles the file, so it must come from a trusted source
            return pd.read_pickle(df_path), faces_dir
    raise NotImplementedError('Unknown dataset: {}'.format(dataset))
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def get_split_df(df: pd.DataFrame, dataset: str, split: str) -> pd.DataFrame:
    """
    Select the rows of a faces DataFrame that belong to one split of a dataset.

    The random selections are made reproducible by temporarily seeding the global NumPy
    random state. The caller's state is restored on exit, including when an error is raised.

    :param df: DataFrame of faces. Columns read depend on the dataset: 'folder' (dfdc);
               'source', 'quality', 'video', 'original' (ff); 'label', 'test', 'video', 'original' (celebdf)
    :param dataset: dataset name ('dfdc-35-5-10', any name starting with 'ff-c23-720-140-140', or 'celebdf')
    :param split: 'train', 'val' or 'test'
    :return: DataFrame restricted to the requested split
    :raises NotImplementedError: on unknown dataset or split
    """

    def frames_from(originals) -> pd.DataFrame:
        # Frames whose 'original' is one of the given videos, followed by the frames of those videos
        return pd.concat((df[df['original'].isin(originals)], df[df['video'].isin(originals)]), axis=0)

    if dataset == 'dfdc-35-5-10':
        if split == 'train':
            split_df = df[df['folder'].isin(range(35))]
        elif split == 'val':
            split_df = df[df['folder'].isin(range(35, 40))]
        elif split == 'test':
            split_df = df[df['folder'].isin(range(40, 50))]
        else:
            raise NotImplementedError('Unknown split: {}'.format(split))
    elif dataset.startswith('ff-c23-720-140-140'):
        # Save random state
        st0 = np.random.get_state()
        try:
            # Set seed for this selection only
            np.random.seed(41)
            # Split on original videos
            crf = dataset.split('-')[1]
            random_youtube_videos = np.random.permutation(
                df[(df['source'] == 'youtube') & (df['quality'] == crf)]['video'].unique())
            if split == 'train':
                split_df = frames_from(random_youtube_videos[:720])
            elif split == 'val':
                split_df = frames_from(random_youtube_videos[720:720 + 140])
            elif split == 'test':
                split_df = frames_from(random_youtube_videos[720 + 140:])
            else:
                raise NotImplementedError('Unknown split: {}'.format(split))

            if dataset.endswith('fpv'):
                # '<N>fpv' suffix: keep N randomly chosen frames per video
                fpv = int(dataset.rsplit('-', 1)[1][:-3])
                idxs = [
                    np.random.choice(split_df[split_df['video'] == video].index, fpv, replace=False)
                    for video in split_df['video'].unique()
                ]
                split_df = split_df.loc[np.concatenate(idxs)]
        finally:
            # Restore random state, even if the selection above failed
            np.random.set_state(st0)
    elif dataset == 'celebdf':

        seed = 41
        num_real_train = 600

        # Save random state
        st0 = np.random.get_state()
        try:
            # Set seed for this selection only
            np.random.seed(seed)
            # Split on original videos
            random_train_val_real_videos = np.random.permutation(
                df[(df['label'] == False) & (df['test'] == False)]['video'].unique())
            if split == 'train':
                split_df = frames_from(random_train_val_real_videos[:num_real_train])
            elif split == 'val':
                split_df = frames_from(random_train_val_real_videos[num_real_train:])
            elif split == 'test':
                split_df = df[df['test'] == True]
            else:
                raise NotImplementedError('Unknown split: {}'.format(split))
        finally:
            # Restore random state, even if the selection above failed
            np.random.set_state(st0)
    else:
        raise NotImplementedError('Unknown dataset: {}'.format(dataset))
    return split_df
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
def make_splits(dfdc_df: str, ffpp_df: str, dfdc_dir: str, ffpp_dir: str, dbs: Dict[str, List[str]]) -> Dict[str, Dict[str, Tuple[pd.DataFrame, str]]]:
    """
    Make splits and return, for every split and dataset, the split DataFrame and the faces root.

    :param dfdc_df: str, path to the DataFrame containing info on the faces extracted from the DFDC dataset with extract_faces.py
    :param ffpp_df: str, path to the DataFrame containing info on the faces extracted from the FF++ dataset with extract_faces.py
    :param dfdc_dir: str, path to the directory containing the faces extracted from the DFDC dataset with extract_faces.py
    :param ffpp_dir: str, path to the directory containing the faces extracted from the FF++ dataset with extract_faces.py
    :param dbs: {split_name: [split_dataset1, split_dataset2, ...]}
                Example: {'train': ['dfdc-35-5-10'], 'val': ['dfdc-35-5-10']}
    :return: {split_name: {split_dataset: (split DataFrame, faces root)}}
             Example: {'train': {'dfdc-35-5-10': (dfdc_train_df, 'path/to/dir/of/DFDC/faces')}}
    """
    # Each full DataFrame is loaded once and reused across splits
    loaded = {}
    result = {}
    for split_name, split_dbs in dbs.items():
        per_db = result[split_name] = dict()
        for split_db in split_dbs:
            if split_db not in loaded:
                loaded[split_db] = load_df(dfdc_df, ffpp_df, dfdc_dir, ffpp_dir, split_db)
            full_df, root = loaded[split_db]
            per_db[split_db] = (get_split_df(df=full_df, dataset=split_db, split=split_name), root)

    return result
|
models/icpr2020dfdc/isplutils/utils.py
ADDED
|
@@ -0,0 +1,247 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Video Face Manipulation Detection Through Ensemble of CNNs
|
| 3 |
+
|
| 4 |
+
Image and Sound Processing Lab - Politecnico di Milano
|
| 5 |
+
|
| 6 |
+
Nicolò Bonettini
|
| 7 |
+
Edoardo Daniele Cannas
|
| 8 |
+
Sara Mandelli
|
| 9 |
+
Luca Bondi
|
| 10 |
+
Paolo Bestagini
|
| 11 |
+
"""
|
| 12 |
+
from pprint import pprint
|
| 13 |
+
from typing import Iterable, List
|
| 14 |
+
|
| 15 |
+
import albumentations as A
|
| 16 |
+
import cv2
|
| 17 |
+
import numpy as np
|
| 18 |
+
import scipy
|
| 19 |
+
import torch
|
| 20 |
+
from PIL import Image
|
| 21 |
+
from albumentations.pytorch import ToTensorV2
|
| 22 |
+
from matplotlib import pyplot as plt
|
| 23 |
+
from torch import nn as nn
|
| 24 |
+
from torchvision import transforms
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def extract_meta_av(path: str) -> (int, int, int):
    """
    Extract video height, width and number of frames to index the files

    The container is closed before returning, so indexing many files does not leave
    them all open.

    :param path: path of the video file, opened with PyAV
    :return: (height, width, frames) of the first video stream, or (0, 0, 0) if the file
             cannot be read or has no video stream
    """
    import av
    # Keep catching `av.AVError` where it exists. NOTE(review): newer PyAV releases appear
    # to expose only `FFmpegError` -- confirm against the installed version.
    av_error = getattr(av, 'AVError', None) or getattr(av, 'FFmpegError', OSError)
    try:
        with av.open(path) as video:
            video_stream = video.streams.video[0]
            return video_stream.height, video_stream.width, video_stream.frames
    except av_error as e:
        print('Error while reading file: {}'.format(path))
        print(e)
        return 0, 0, 0
    except IndexError as e:
        # No video stream in the container
        print('Error while processing file: {}'.format(path))
        print(e)
        return 0, 0, 0
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def extract_meta_cv(path: str) -> (int, int, int):
    """
    Extract video height, width and number of frames to index the files

    The capture handle is always released before returning.

    :param path: path of the video file, opened with OpenCV
    :return: (height, width, num_frames) as reported by OpenCV, or (0, 0, 0) if an exception is raised
    """
    vid = None
    try:
        vid = cv2.VideoCapture(path)
        num_frames = int(vid.get(cv2.CAP_PROP_FRAME_COUNT))
        height = int(vid.get(cv2.CAP_PROP_FRAME_HEIGHT))
        width = int(vid.get(cv2.CAP_PROP_FRAME_WIDTH))
        return height, width, num_frames
    except Exception as e:
        print('Error while reading file: {}'.format(path))
        print(e)
        return 0, 0, 0
    finally:
        # Release the underlying file handle / decoder
        if vid is not None:
            vid.release()
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def adapt_bb(frame_height: int, frame_width: int, bb_height: int, bb_width: int, left: int, top: int, right: int,
             bottom: int) -> (
        int, int, int, int):
    """
    Build a box of the requested size centered on the given bounding box, clamped to the frame.

    The start of each side is clamped to 0 and its end to the frame size, so a box touching
    the right/bottom border comes out smaller than requested.

    :return: (left, top, right, bottom) of the adapted box
    """

    def span(center: int, extent: int, limit: int) -> (int, int):
        # Start half an extent before the center (not below 0), end one extent later (not past limit)
        start = max(center - extent // 2, 0)
        return start, min(start + extent, limit)

    new_top, new_bottom = span((bottom + top) // 2, bb_height, frame_height)
    new_left, new_right = span((left + right) // 2, bb_width, frame_width)
    return new_left, new_top, new_right, new_bottom
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
def extract_bb(frame: Image.Image, bb: Iterable, scale: str, size: int) -> Image.Image:
    """
    Extract a face from a frame according to the given bounding box and scale policy

    :param frame: Entire frame
    :param bb: Bounding box (left,top,right,bottom) in the reference system of the frame
    :param scale: "scale" to crop a square with size equal to the maximum between height and width of the face, then scale to size
                  "crop" to crop a fixed square around face center,
                  "tight" to crop face exactly at the bounding box with no scaling
    :param size: size of the face
    :return: the cropped (and, for "scale", resized) face
    :raises ValueError: on unknown scale policy
    """
    left, top, right, bottom = bb

    if scale == "scale":
        width = int(right) - int(left)
        height = int(bottom) - int(top)
        # Degenerate boxes fall back to a ratio of 1, i.e. a size x size crop
        if width > 0 and height > 0:
            ratio = min(size / height, size / width)
        else:
            ratio = 1.
        side = int(size / ratio)
        box = adapt_bb(frame.height, frame.width, side, side, left, top, right, bottom)
        return frame.crop(box).resize((size, size), Image.BILINEAR)

    if scale == "crop":
        # Find the center of the bounding box and cut an area around it of size x size
        box = adapt_bb(frame.height, frame.width, size, size, left, top, right, bottom)
        return frame.crop(box)

    if scale == "tight":
        box = adapt_bb(frame.height, frame.width, bottom - top, right - left, left, top, right, bottom)
        return frame.crop(box)

    raise ValueError('Unknown scale value: {}'.format(scale))
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
def showimage(img_tensor: torch.Tensor):
    """
    Display a normalized image tensor in a new matplotlib figure.

    The two Normalize steps undo a normalization with mean (0.485, 0.456, 0.406) and
    std (0.229, 0.224, 0.225) before converting to a PIL image.

    :param img_tensor: normalized image tensor
    """
    undo_std = transforms.Normalize(mean=[0, 0, 0, ], std=[1 / 0.229, 1 / 0.224, 1 / 0.225])
    undo_mean = transforms.Normalize(mean=[-0.485, -0.456, -0.406], std=[1, 1, 1])
    topil = transforms.Compose([undo_std, undo_mean, transforms.ToPILImage()])

    plt.figure()
    plt.imshow(topil(img_tensor))
    plt.show()
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
def make_train_tag(net_class: nn.Module,
                   face_policy: str,
                   patch_size: int,
                   traindb: List[str],
                   seed: int,
                   suffix: str,
                   debug: bool,
                   ):
    """
    Build (and print) the tag string that identifies a training run.

    The tag is the '_'-joined list of 'key-value' pairs for net, traindb, face, size and seed,
    prefixed with 'debug_' when debugging and followed by '_<suffix>' when a suffix is given.

    :param net_class: network class, only its __name__ is used
    :param face_policy: face extraction policy
    :param patch_size: face size
    :param traindb: training dataset names, joined with '-'
    :param seed: random seed
    :param suffix: optional tag suffix (None for no suffix)
    :param debug: whether to prepend 'debug_'
    :return: the tag
    """
    # Training parameters and tag
    tag_params = {
        'net': net_class.__name__,
        'traindb': '-'.join(traindb),
        'face': face_policy,
        'size': patch_size,
        'seed': seed,
    }
    print('Parameters')
    pprint(tag_params)

    pieces = [key + '-' + str(value) for key, value in tag_params.items()]
    tag = ('debug_' if debug else '') + '_'.join(pieces)
    if suffix is not None:
        tag = tag + '_' + suffix
    print('Tag: {:s}'.format(tag))
    return tag
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
def get_transformer(face_policy: str, patch_size: int, net_normalizer: transforms.Normalize, train: bool):
    """
    Compose the albumentations pipeline used to feed faces to a network.

    The pipeline is: policy-specific loading transformations, then (training only) a random
    downscale and a block of augmentations, then normalization and conversion to tensor.

    :param face_policy: 'scale' or 'tight'
    :param patch_size: side of the square patch given to the network
    :param net_normalizer: normalizer whose mean and std are applied to the patch
    :param train: whether to include the training-only transformations
    :return: the composed transformation
    :raises ValueError: on unknown face_policy
    """
    if face_policy == 'scale':
        # The loader crops the face isotropically then scales to a square of size patch_size_load
        pipeline = [
            A.PadIfNeeded(min_height=patch_size, min_width=patch_size,
                          border_mode=cv2.BORDER_CONSTANT, value=0, always_apply=True),
            A.Resize(height=patch_size, width=patch_size, always_apply=True),
        ]
    elif face_policy == 'tight':
        # The loader crops the face tightly without any scaling
        pipeline = [
            A.LongestMaxSize(max_size=patch_size, always_apply=True),
            A.PadIfNeeded(min_height=patch_size, min_width=patch_size,
                          border_mode=cv2.BORDER_CONSTANT, value=0, always_apply=True),
        ]
    else:
        raise ValueError('Unknown value for face_policy: {}'.format(face_policy))

    if train:
        # Same random downscale for both policies: replaces scaled dataset
        pipeline.append(A.Downscale(scale_max=0.5, scale_min=0.5, p=0.5))
        # NOTE(review): IAAAdditiveGaussianNoise may be missing from recent albumentations
        # releases -- confirm against the pinned version before training.
        pipeline.append(
            A.Compose([
                A.HorizontalFlip(),
                A.OneOf([
                    A.RandomBrightnessContrast(),
                    A.HueSaturationValue(hue_shift_limit=10, sat_shift_limit=30, val_shift_limit=20),
                ]),
                A.OneOf([
                    A.ISONoise(),
                    A.IAAAdditiveGaussianNoise(scale=(0.01 * 255, 0.03 * 255)),
                ]),
                A.Downscale(scale_min=0.7, scale_max=0.9, interpolation=cv2.INTER_LINEAR),
                A.ImageCompression(quality_lower=50, quality_upper=99),
            ], )
        )

    # Common final transformations
    pipeline.append(A.Normalize(mean=net_normalizer.mean, std=net_normalizer.std, ))
    pipeline.append(ToTensorV2())
    return A.Compose(pipeline)
|
| 208 |
+
|
| 209 |
+
|
| 210 |
+
def aggregate(x, deadzone: float, pre_mult: float, policy: str, post_mult: float, clipmargin: float, params=None):
    """
    Aggregate a set of scores into a single value clipped to [clipmargin, 1 - clipmargin].

    :param x: array-like of scores; it is copied, never modified
    :param deadzone: if > 0, scores with absolute value not greater than deadzone are discarded
                     (if none is left, a single 0 score is used)
    :param pre_mult: multiplier applied to the scores before the sigmoid (or before the mean for 'voting')
    :param policy: one of 'mean', 'sigmean', 'meanp', 'median', 'sigmedian', 'maxabs', 'avgvoting', 'voting'
    :param post_mult: multiplier applied around 0.5 after the sigmoid (to the mean sign for 'avgvoting')
    :param clipmargin: margin kept from 0 and 1 in the returned value
    :param params: optional policy parameters; 'p' is the exponent of 'meanp' (default 3).
                   The dict is only read, so it can be reused across calls.
    :return: the aggregated value
    :raises NotImplementedError: on unknown policy
    """
    # Imported here so the submodule is loaded whatever the installed SciPy does on `import scipy`
    from scipy.special import expit

    # No shared mutable default, and never mutate the caller's dict
    params = {} if params is None else params

    x = np.array(x)  # private copy; also accepts plain sequences
    if deadzone > 0:
        x = x[(x > deadzone) | (x < -deadzone)]
    if len(x) == 0:
        x = np.asarray([0, ])
    if policy == 'mean':
        x = np.mean(x)
        x = expit(x * pre_mult)
        x = (x - 0.5) * post_mult + 0.5
    elif policy == 'sigmean':
        x = expit(x * pre_mult).mean()
        x = (x - 0.5) * post_mult + 0.5
    elif policy == 'meanp':
        pow_coeff = params.get('p', 3)
        # Signed power mean: raise magnitudes to p, average, then take the signed p-th root
        x = np.mean(np.sign(x) * (np.abs(x) ** pow_coeff))
        x = np.sign(x) * (np.abs(x) ** (1 / pow_coeff))
        x = expit(x * pre_mult)
        x = (x - 0.5) * post_mult + 0.5
    elif policy == 'median':
        x = expit(np.median(x) * pre_mult)
        x = (x - 0.5) * post_mult + 0.5
    elif policy == 'sigmedian':
        x = np.median(expit(x * pre_mult))
        x = (x - 0.5) * post_mult + 0.5
    elif policy == 'maxabs':
        # Score with the largest magnitude, sign preserved
        x = np.min(x) if abs(np.min(x)) > abs(np.max(x)) else np.max(x)
        x = expit(x * pre_mult)
        x = (x - 0.5) * post_mult + 0.5
    elif policy == 'avgvoting':
        x = np.mean(np.sign(x))
        x = (x * post_mult + 1) / 2
    elif policy == 'voting':
        x = np.sign(np.mean(x * pre_mult))
        x = (x - 0.5) * post_mult + 0.5
    else:
        raise NotImplementedError()
    return np.clip(x, clipmargin, 1 - clipmargin)
|
models/icpr2020dfdc/test_model.py
ADDED
|
@@ -0,0 +1,270 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Video Face Manipulation Detection Through Ensemble of CNNs
|
| 3 |
+
|
| 4 |
+
Image and Sound Processing Lab - Politecnico di Milano
|
| 5 |
+
|
| 6 |
+
Nicolò Bonettini
|
| 7 |
+
Edoardo Daniele Cannas
|
| 8 |
+
Sara Mandelli
|
| 9 |
+
Luca Bondi
|
| 10 |
+
Paolo Bestagini
|
| 11 |
+
"""
|
| 12 |
+
import argparse
|
| 13 |
+
import gc
|
| 14 |
+
from collections import OrderedDict
|
| 15 |
+
from pathlib import Path
|
| 16 |
+
|
| 17 |
+
import albumentations as A
|
| 18 |
+
import matplotlib.pyplot as plt
|
| 19 |
+
import numpy as np
|
| 20 |
+
import pandas as pd
|
| 21 |
+
import torch
|
| 22 |
+
import torch.nn as nn
|
| 23 |
+
from torch.utils.data import DataLoader
|
| 24 |
+
from tqdm import tqdm
|
| 25 |
+
|
| 26 |
+
from architectures import fornet
|
| 27 |
+
from architectures.fornet import FeatureExtractor
|
| 28 |
+
from isplutils import utils, split
|
| 29 |
+
from isplutils.data import FrameFaceDatasetTest
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def main():
    """
    Command-line entry point: run a trained network on the selected dataset splits and save,
    for each (dataset, split), a pickled DataFrame of frames with added 'score' and 'loss' columns.

    Face policy, patch size and network name are parsed from the model path, which must contain
    'face-<policy>', 'size-<int>' and 'net-<name>' fields, each terminated by '_'.
    """
    # Args
    parser = argparse.ArgumentParser()

    parser.add_argument('--testsets', type=str, help='Testing datasets', nargs='+', choices=split.available_datasets,
                        required=True)
    parser.add_argument('--testsplits', type=str, help='Test split', nargs='+', default=['val', 'test'],
                        choices=['train', 'val', 'test'])
    parser.add_argument('--dfdc_faces_df_path', type=str, action='store',
                        help='Path to the Pandas Dataframe obtained from extract_faces.py on the DFDC dataset. '
                             'Required for training/validating on the DFDC dataset.')
    parser.add_argument('--dfdc_faces_dir', type=str, action='store',
                        help='Path to the directory containing the faces extracted from the DFDC dataset. '
                             'Required for training/validating on the DFDC dataset.')
    parser.add_argument('--ffpp_faces_df_path', type=str, action='store',
                        help='Path to the Pandas Dataframe obtained from extract_faces.py on the FF++ dataset. '
                             'Required for training/validating on the FF++ dataset.')
    parser.add_argument('--ffpp_faces_dir', type=str, action='store',
                        help='Path to the directory containing the faces extracted from the FF++ dataset. '
                             'Required for training/validating on the FF++ dataset.')

    # Specify trained model path
    parser.add_argument('--model_path', type=Path, help='Full path of the trained model', required=True)

    # Common params
    parser.add_argument('--batch', type=int, help='Batch size to fit in GPU memory', default=128)

    parser.add_argument('--workers', type=int, help='Num workers for data loaders', default=6)
    parser.add_argument('--device', type=int, help='GPU id', default=0)

    parser.add_argument('--debug', action='store_true', help='Debug flag', )
    parser.add_argument('--num_video', type=int, help='Number of real-fake videos to test')
    parser.add_argument('--results_dir', type=Path, help='Output folder',
                        default='results/')

    parser.add_argument('--override', action='store_true', help='Override existing results', )

    args = parser.parse_args()

    device = torch.device('cuda:{}'.format(args.device)) if torch.cuda.is_available() else torch.device('cpu')
    num_workers = args.workers
    batch_size = args.batch
    max_num_videos_per_label = args.num_video  # number of real-fake videos to test
    model_path = args.model_path
    results_dir = args.results_dir
    debug = args.debug
    override = args.override
    test_sets = args.testsets
    test_splits = args.testsplits
    dfdc_df_path = args.dfdc_faces_df_path
    ffpp_df_path = args.ffpp_faces_df_path
    dfdc_faces_dir = args.dfdc_faces_dir
    ffpp_faces_dir = args.ffpp_faces_dir

    # get arguments from the model path
    def tag_field(key):
        # Value of the '<key>-<value>_' field embedded in the model path
        return str(model_path).split(key + '-')[1].split('_')[0]

    face_policy = tag_field('face')
    patch_size = int(tag_field('size'))
    net_name = tag_field('net')
    model_name = '_'.join(model_path.with_suffix('').parts[-2:])

    # Load net
    net_class = getattr(fornet, net_name)

    # load model
    print('Loading model...')
    state_tmp = torch.load(model_path, map_location='cpu')
    if 'net' in state_tmp.keys():
        state = state_tmp
    else:
        # Bare state dict: wrap it under 'net' and prefix every key with 'model.'
        state = OrderedDict({'net': OrderedDict(('model.{}'.format(k), v) for k, v in state_tmp.items())})
    net: FeatureExtractor = net_class().eval().to(device)

    incomp_keys = net.load_state_dict(state['net'], strict=True)
    print(incomp_keys)
    print('Model loaded!')

    # val loss per-frame
    criterion = nn.BCEWithLogitsLoss(reduction='none')

    # Define data transformers
    test_transformer = utils.get_transformer(face_policy, patch_size, net.get_normalizer(), train=False)

    # datasets and dataloaders (from train_binclass.py)
    print('Loading data...')
    # Check if paths for DFDC and FF++ extracted faces and DataFrames are provided
    for dataset in test_sets:
        family = dataset.split('-')[0]
        if family == 'dfdc' and (dfdc_df_path is None or dfdc_faces_dir is None):
            raise RuntimeError('Specify DataFrame and directory for DFDC faces for testing!')
        elif family == 'ff' and (ffpp_df_path is None or ffpp_faces_dir is None):
            raise RuntimeError('Specify DataFrame and directory for FF++ faces for testing!')
    splits = split.make_splits(dfdc_df=dfdc_df_path, ffpp_df=ffpp_df_path, dfdc_dir=dfdc_faces_dir,
                               ffpp_dir=ffpp_faces_dir, dbs={'train': test_sets, 'val': test_sets, 'test': test_sets})
    split_names = ('train', 'val', 'test')
    # Every entry of splits[name] is a (DataFrame, root) pair
    split_dfs = {name: [entry[0] for entry in splits[name].values()] for name in split_names}
    split_roots = {name: [entry[1] for entry in splits[name].values()] for name in split_names}

    # Output paths
    out_folder = results_dir.joinpath(model_name)
    out_folder.mkdir(mode=0o775, parents=True, exist_ok=True)

    # Samples selection
    if max_num_videos_per_label and max_num_videos_per_label > 0:
        dfs_out = {name: [select_videos(df, max_num_videos_per_label) for df in split_dfs[name]]
                   for name in split_names}
    else:
        dfs_out = split_dfs

    # Extractions list: train first, then validation, then test
    extr_list = []
    for name in split_names:
        if name not in test_splits:
            continue
        for idx, dataset in enumerate(test_sets):
            extr_list.append(
                (dfs_out[name][idx], out_folder.joinpath(dataset + '_' + name + '.pkl'), split_roots[name][idx],
                 dataset + ' ' + name.upper())
            )

    for df, df_path, df_root, tag in extr_list:
        # Existing results are kept unless --override is given
        if not (override or not df_path.exists()):
            continue

        print('\n##### PREDICT VIDEOS FROM {} #####'.format(tag))
        print('Real frames: {}'.format(sum(df['label'] == False)))
        print('Fake frames: {}'.format(sum(df['label'] == True)))
        print('Real videos: {}'.format(df[df['label'] == False]['video'].nunique()))
        print('Fake videos: {}'.format(df[df['label'] == True]['video'].nunique()))
        dataset_out = process_dataset(root=df_root, df=df, net=net, criterion=criterion,
                                      patch_size=patch_size,
                                      face_policy=face_policy, transformer=test_transformer,
                                      batch_size=batch_size,
                                      num_workers=num_workers, device=device, )
        df['score'] = dataset_out['score'].astype(np.float32)
        df['loss'] = dataset_out['loss'].astype(np.float32)
        print('Saving results to: {}'.format(df_path))
        df.to_pickle(str(df_path))

        if debug:
            plt.figure()
            plt.title(tag)
            plt.hist(df[df.label == True].score, bins=100, alpha=0.6, label='FAKE frames')
            plt.hist(df[df.label == False].score, bins=100, alpha=0.6, label='REAL frames')
            plt.legend()

        # Drop the per-dataset results before moving to the next one
        del dataset_out
        del df
        gc.collect()

    if debug:
        plt.show()

    print('Completed!')
|
| 197 |
+
|
| 198 |
+
|
| 199 |
+
def process_dataset(df: pd.DataFrame,
                    root: str,
                    net: FeatureExtractor,
                    criterion,
                    patch_size: int,
                    face_policy: str,
                    transformer: A.BasicTransform,
                    batch_size: int,
                    num_workers: int,
                    device: torch.device,
                    ) -> dict:
    """
    Run the network over every frame listed in a DataFrame and collect per-frame results.

    :param df: DataFrame of frames to evaluate; one output value is produced per row
    :param root: root passed to FrameFaceDatasetTest together with df
    :param net: network called on each batch of images
    :param criterion: loss called as criterion(output, labels); its result is indexed with [:, 0],
                      so it has to return one value per sample (not a reduced scalar)
    :param patch_size: forwarded to the dataset as ``size``
    :param face_policy: forwarded to the dataset as ``scale``
    :param transformer: transformation forwarded to the dataset
    :param batch_size: batch size of the data loader
    :param num_workers: number of workers of the data loader
    :param device: torch.device, or an int/str that is converted to one
    :return: dict with two arrays of length len(df): 'score' (first output column of the net)
             and 'loss' (first column of the criterion output)
    """
    # Accept plain device identifiers as well as torch.device objects
    if isinstance(device, (int, str)):
        device = torch.device(device)

    frames = FrameFaceDatasetTest(
        root=root,
        df=df,
        size=patch_size,
        scale=face_policy,
        transformer=transformer,
    )

    # Output buffers, filled batch by batch in loader order
    n_frames = len(df)
    results = {'score': np.zeros(n_frames), 'loss': np.zeros(n_frames)}

    # No shuffling and no dropped batch: position in the buffers must follow the dataset order
    loader = DataLoader(frames, batch_size=batch_size, num_workers=num_workers, shuffle=False, drop_last=False)

    offset = 0
    with torch.no_grad():
        for batch in tqdm(loader):
            images = batch[0].to(device)
            labels = batch[1].to(device)
            logits = net(images)
            per_sample_loss = criterion(logits, labels)

            window = slice(offset, offset + len(images))
            results['score'][window] = logits.cpu().numpy()[:, 0]
            results['loss'][window] = per_sample_loss.cpu().numpy()[:, 0]
            offset = window.stop

    return results
|
| 240 |
+
|
| 241 |
+
|
| 242 |
+
def select_videos(df: pd.DataFrame, max_videos_per_label: int) -> pd.DataFrame:
    """
    Select up to a maximum number of videos per label.

    The selection is reproducible: it is always drawn from a private random generator
    seeded with 42, first for the fake videos and then for the real ones. The global
    NumPy random state is never touched, so it cannot be left re-seeded if anything
    in here raises (e.g. a missing column).

    :param df: DataFrame of frames. Required columns: 'video','label'
    :param max_videos_per_label: maximum number of real and fake videos
    :return: DataFrame of selected frames (a copy; fake frames first, then real frames)
    :raises ValueError: from pd.concat(verify_integrity=True) if the selected frames
                        have overlapping index values
    """
    # Private legacy generator: same stream as np.random.seed(42) on the global state,
    # without the save/restore dance around the global RNG.
    rng = np.random.RandomState(42)

    def _frames_of_selected_videos(df_label: pd.DataFrame) -> pd.DataFrame:
        # Pick at most max_videos_per_label distinct videos and keep all their frames
        videos = df_label['video'].unique()
        selected_videos = rng.choice(videos, min(max_videos_per_label, len(videos)), replace=False)
        return df_label[df_label['video'].isin(selected_videos)]

    # Order matters for reproducibility: fake videos are drawn before real ones
    df_selected_fake_frames = _frames_of_selected_videos(df[df.label == True])
    df_selected_real_frames = _frames_of_selected_videos(df[df.label == False])

    return pd.concat((df_selected_fake_frames, df_selected_real_frames), axis=0, verify_integrity=True).copy()
|
| 267 |
+
|
| 268 |
+
|
| 269 |
+
# Run the command-line entry point only when this file is executed as a script
if __name__ == '__main__':
    main()
|
models/icpr2020dfdc/train_binclass.py
ADDED
|
@@ -0,0 +1,460 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Video Face Manipulation Detection Through Ensemble of CNNs
|
| 3 |
+
|
| 4 |
+
Image and Sound Processing Lab - Politecnico di Milano
|
| 5 |
+
|
| 6 |
+
Nicolò Bonettini
|
| 7 |
+
Edoardo Daniele Cannas
|
| 8 |
+
Sara Mandelli
|
| 9 |
+
Luca Bondi
|
| 10 |
+
Paolo Bestagini
|
| 11 |
+
"""
|
| 12 |
+
import argparse
|
| 13 |
+
import os
|
| 14 |
+
import shutil
|
| 15 |
+
import warnings
|
| 16 |
+
|
| 17 |
+
import albumentations as A
|
| 18 |
+
import numpy as np
|
| 19 |
+
import pandas as pd
|
| 20 |
+
import torch
|
| 21 |
+
import torch.multiprocessing
|
| 22 |
+
from torchvision.transforms import ToPILImage, ToTensor
|
| 23 |
+
|
| 24 |
+
from isplutils import utils, split
|
| 25 |
+
|
| 26 |
+
torch.multiprocessing.set_sharing_strategy('file_system')
|
| 27 |
+
import torch.nn as nn
|
| 28 |
+
from albumentations.pytorch import ToTensorV2
|
| 29 |
+
from sklearn.metrics import roc_auc_score
|
| 30 |
+
from tensorboardX import SummaryWriter
|
| 31 |
+
from torch import optim
|
| 32 |
+
from torch.utils.data import DataLoader
|
| 33 |
+
from tqdm import tqdm
|
| 34 |
+
from PIL import ImageChops, Image
|
| 35 |
+
|
| 36 |
+
from architectures import fornet
|
| 37 |
+
from isplutils.data import FrameFaceIterableDataset, load_face
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def main():
    """
    Command-line entry point: train a binary real/fake face classifier.

    Parses the arguments, builds the network, optimizer and LR scheduler, optionally resumes
    from a checkpoint, then runs an iteration-based training loop with periodic logging,
    checkpointing and validation. The loop ends when the learning rate equals its minimum
    value or when the maximum number of iterations is exceeded.
    """
    # Args
    parser = argparse.ArgumentParser()
    parser.add_argument('--net', type=str, help='Net model class', required=True)
    parser.add_argument('--traindb', type=str, help='Training datasets', nargs='+', choices=split.available_datasets,
                        required=True)
    parser.add_argument('--valdb', type=str, help='Validation datasets', nargs='+', choices=split.available_datasets,
                        required=True)
    parser.add_argument('--dfdc_faces_df_path', type=str, action='store',
                        help='Path to the Pandas Dataframe obtained from extract_faces.py on the DFDC dataset. '
                             'Required for training/validating on the DFDC dataset.')
    parser.add_argument('--dfdc_faces_dir', type=str, action='store',
                        help='Path to the directory containing the faces extracted from the DFDC dataset. '
                             'Required for training/validating on the DFDC dataset.')
    parser.add_argument('--ffpp_faces_df_path', type=str, action='store',
                        help='Path to the Pandas Dataframe obtained from extract_faces.py on the FF++ dataset. '
                             'Required for training/validating on the FF++ dataset.')
    parser.add_argument('--ffpp_faces_dir', type=str, action='store',
                        help='Path to the directory containing the faces extracted from the FF++ dataset. '
                             'Required for training/validating on the FF++ dataset.')
    parser.add_argument('--face', type=str, help='Face crop or scale', required=True,
                        choices=['scale', 'tight'])
    parser.add_argument('--size', type=int, help='Train patch size', required=True)

    parser.add_argument('--batch', type=int, help='Batch size to fit in GPU memory', default=32)
    parser.add_argument('--lr', type=float, default=1e-5, help='Learning rate')
    parser.add_argument('--valint', type=int, help='Validation interval (iterations)', default=500)
    parser.add_argument('--patience', type=int, help='Patience before dropping the LR [validation intervals]',
                        default=10)
    parser.add_argument('--maxiter', type=int, help='Maximum number of iterations', default=20000)
    parser.add_argument('--init', type=str, help='Weight initialization file')
    parser.add_argument('--scratch', action='store_true', help='Train from scratch')

    parser.add_argument('--trainsamples', type=int, help='Limit the number of train samples per epoch', default=-1)
    parser.add_argument('--valsamples', type=int, help='Limit the number of validation samples per epoch',
                        default=6000)

    parser.add_argument('--logint', type=int, help='Training log interval (iterations)', default=100)
    parser.add_argument('--workers', type=int, help='Num workers for data loaders', default=6)
    parser.add_argument('--device', type=int, help='GPU device id', default=0)
    parser.add_argument('--seed', type=int, help='Random seed', default=0)

    parser.add_argument('--debug', action='store_true', help='Activate debug')
    parser.add_argument('--suffix', type=str, help='Suffix to default tag')

    parser.add_argument('--attention', action='store_true',
                        help='Enable Tensorboard log of attention masks')
    parser.add_argument('--log_dir', type=str, help='Directory for saving the training logs',
                        default='runs/binclass/')
    parser.add_argument('--models_dir', type=str, help='Directory for saving the models weights',
                        default='weights/binclass/')

    args = parser.parse_args()

    # Parse arguments
    # The network class is looked up by name in the fornet module
    net_class = getattr(fornet, args.net)
    train_datasets = args.traindb
    val_datasets = args.valdb
    dfdc_df_path = args.dfdc_faces_df_path
    ffpp_df_path = args.ffpp_faces_df_path
    dfdc_faces_dir = args.dfdc_faces_dir
    ffpp_faces_dir = args.ffpp_faces_dir
    face_policy = args.face
    face_size = args.size

    batch_size = args.batch
    initial_lr = args.lr
    validation_interval = args.valint
    patience = args.patience
    max_num_iterations = args.maxiter
    initial_model = args.init
    train_from_scratch = args.scratch

    max_train_samples = args.trainsamples
    max_val_samples = args.valsamples

    log_interval = args.logint
    num_workers = args.workers
    # Falls back to CPU when CUDA is not available, regardless of --device
    device = torch.device('cuda:{:d}'.format(args.device)) if torch.cuda.is_available() else torch.device('cpu')
    seed = args.seed

    debug = args.debug
    suffix = args.suffix

    enable_attention = args.attention

    weights_folder = args.models_dir
    logs_folder = args.log_dir

    # Random initialization
    np.random.seed(seed)
    torch.random.manual_seed(seed)

    # Load net
    net: nn.Module = net_class().to(device)

    # Loss and optimizers
    criterion = nn.BCEWithLogitsLoss()

    # Lower bound for the scheduler; training stops once the LR equals this value (see loop below)
    min_lr = initial_lr * 1e-5
    optimizer = optim.Adam(net.get_trainable_parameters(), lr=initial_lr)
    lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer=optimizer,
        mode='min',
        factor=0.1,
        patience=patience,
        cooldown=2 * patience,
        min_lr=min_lr,
    )

    # Run identifier, used as sub-folder name for both weights and logs
    tag = utils.make_train_tag(net_class=net_class,
                               traindb=train_datasets,
                               face_policy=face_policy,
                               patch_size=face_size,
                               seed=seed,
                               suffix=suffix,
                               debug=debug,
                               )

    # Model checkpoint paths
    bestval_path = os.path.join(weights_folder, tag, 'bestval.pth')
    last_path = os.path.join(weights_folder, tag, 'last.pth')
    # Template: formatted with the iteration number at each validation step
    periodic_path = os.path.join(weights_folder, tag, 'it{:06d}.pth')

    os.makedirs(os.path.join(weights_folder, tag), exist_ok=True)

    # Load model
    # Starting values, overwritten below when a checkpoint is found
    val_loss = min_val_loss = 10
    epoch = iteration = 0
    net_state = None
    opt_state = None
    if initial_model is not None:
        # If given load initial model
        # Only the network weights are taken: optimizer state and counters start fresh
        print('Loading model form: {}'.format(initial_model))
        state = torch.load(initial_model, map_location='cpu')
        net_state = state['net']
    elif not train_from_scratch and os.path.exists(last_path):
        # Resume: restore weights, optimizer state and counters from the last checkpoint
        print('Loading model form: {}'.format(last_path))
        state = torch.load(last_path, map_location='cpu')
        net_state = state['net']
        opt_state = state['opt']
        iteration = state['iteration'] + 1
        epoch = state['epoch']
    if not train_from_scratch and os.path.exists(bestval_path):
        # Recover the best validation loss so far, so a worse model does not overwrite bestval.pth
        state = torch.load(bestval_path, map_location='cpu')
        min_val_loss = state['val_loss']
    if net_state is not None:
        # strict=False: missing/unexpected keys are reported by the print below instead of raising
        incomp_keys = net.load_state_dict(net_state, strict=False)
        print(incomp_keys)
    if opt_state is not None:
        # The learning rate from the command line overrides the one stored in the checkpoint
        for param_group in opt_state['param_groups']:
            param_group['lr'] = initial_lr
        optimizer.load_state_dict(opt_state)

    # Initialize Tensorboard
    logdir = os.path.join(logs_folder, tag)
    if iteration == 0:
        # If training from scratch or initialization remove history if exists
        shutil.rmtree(logdir, ignore_errors=True)

    # TensorboardX instance
    tb = SummaryWriter(logdir=logdir)
    if iteration == 0:
        # Log the network graph once, traced on a random input of the training patch size
        dummy = torch.randn((1, 3, face_size, face_size), device=device)
        dummy = dummy.to(device)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            tb.add_graph(net, [dummy, ], verbose=False)

    # NOTE(review): the same transformer (built with train=True) is used for the validation
    # dataset below as well; confirm this is intended
    transformer = utils.get_transformer(face_policy=face_policy, patch_size=face_size,
                                        net_normalizer=net.get_normalizer(), train=True)

    # Datasets and data loaders
    print('Loading data')
    # Check if paths for DFDC and FF++ extracted faces and DataFrames are provided
    for dataset in train_datasets:
        if dataset.split('-')[0] == 'dfdc' and (dfdc_df_path is None or dfdc_faces_dir is None):
            raise RuntimeError('Specify DataFrame and directory for DFDC faces for training!')
        elif dataset.split('-')[0] == 'ff' and (ffpp_df_path is None or ffpp_faces_dir is None):
            raise RuntimeError('Specify DataFrame and directory for FF++ faces for training!')
    for dataset in val_datasets:
        if dataset.split('-')[0] == 'dfdc' and (dfdc_df_path is None or dfdc_faces_dir is None):
            raise RuntimeError('Specify DataFrame and directory for DFDC faces for validation!')
        elif dataset.split('-')[0] == 'ff' and (ffpp_df_path is None or ffpp_faces_dir is None):
            raise RuntimeError('Specify DataFrame and directory for FF++ faces for validation!')
    # Load splits with the make_splits function
    splits = split.make_splits(dfdc_df=dfdc_df_path, ffpp_df=ffpp_df_path, dfdc_dir=dfdc_faces_dir, ffpp_dir=ffpp_faces_dir,
                               dbs={'train': train_datasets, 'val': val_datasets})
    # Each split entry is indexed as [0] -> DataFrame, [1] -> root directory
    train_dfs = [splits['train'][db][0] for db in splits['train']]
    train_roots = [splits['train'][db][1] for db in splits['train']]
    val_roots = [splits['val'][db][1] for db in splits['val']]
    val_dfs = [splits['val'][db][0] for db in splits['val']]

    train_dataset = FrameFaceIterableDataset(roots=train_roots,
                                             dfs=train_dfs,
                                             scale=face_policy,
                                             num_samples=max_train_samples,
                                             transformer=transformer,
                                             size=face_size,
                                             )

    val_dataset = FrameFaceIterableDataset(roots=val_roots,
                                           dfs=val_dfs,
                                           scale=face_policy,
                                           num_samples=max_val_samples,
                                           transformer=transformer,
                                           size=face_size,
                                           )

    train_loader = DataLoader(train_dataset, num_workers=num_workers, batch_size=batch_size, )

    val_loader = DataLoader(val_dataset, num_workers=num_workers, batch_size=batch_size, )

    print('Training samples: {}'.format(len(train_dataset)))
    print('Validation samples: {}'.format(len(val_dataset)))

    if len(train_dataset) == 0:
        print('No training samples. Halt.')
        return

    if len(val_dataset) == 0:
        print('No validation samples. Halt.')
        return

    # Outer loop: one pass over train_loader per epoch; `stop` is set by the inner loop
    stop = False
    while not stop:

        # Training
        optimizer.zero_grad()

        # Running sums since the last log step, and predictions/labels since the last validation
        train_loss = train_num = 0
        train_pred_list = []
        train_labels_list = []
        # NOTE(review): `total` only drives the progress bar; whether len(train_loader) is already
        # expressed in batches depends on the PyTorch version -- confirm
        for train_batch in tqdm(train_loader, desc='Epoch {:03d}'.format(epoch), leave=False,
                                total=len(train_loader) // train_loader.batch_size):
            # Re-enable train mode on every batch: validation/attention below switch to eval mode
            net.train()
            batch_data, batch_labels = train_batch

            train_batch_num = len(batch_labels)
            train_num += train_batch_num
            train_labels_list.append(batch_labels.numpy().flatten())

            train_batch_loss, train_batch_pred = batch_forward(net, device, criterion, batch_data, batch_labels)
            train_pred_list.append(train_batch_pred.flatten())

            if torch.isnan(train_batch_loss):
                raise ValueError('NaN loss')

            # Weight the batch loss by the batch size so the logged value is a per-sample mean
            train_loss += train_batch_loss.item() * train_batch_num

            # Optimization
            train_batch_loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            # Logging
            if iteration > 0 and (iteration % log_interval == 0):
                train_loss /= train_num
                tb.add_scalar('train/loss', train_loss, iteration)
                tb.add_scalar('lr', optimizer.param_groups[0]['lr'], iteration)
                tb.add_scalar('epoch', epoch, iteration)

                # Checkpoint
                save_model(net, optimizer, train_loss, val_loss, iteration, batch_size, epoch, last_path)
                train_loss = train_num = 0

            # Validation
            if iteration > 0 and (iteration % validation_interval == 0):

                # Model checkpoint
                save_model(net, optimizer, train_loss, val_loss, iteration, batch_size, epoch,
                           periodic_path.format(iteration))

                # Train cumulative stats
                train_labels = np.concatenate(train_labels_list)
                train_pred = np.concatenate(train_pred_list)
                train_labels_list = []
                train_pred_list = []

                train_roc_auc = roc_auc_score(train_labels, train_pred)
                tb.add_scalar('train/roc_auc', train_roc_auc, iteration)
                tb.add_pr_curve('train/pr', train_labels, train_pred, iteration)

                # Validation
                val_loss = validation_routine(net, device, val_loader, criterion, tb, iteration, 'val')
                tb.flush()

                # LR Scheduler
                lr_scheduler.step(val_loss)

                # Model checkpoint
                if val_loss < min_val_loss:
                    min_val_loss = val_loss
                    save_model(net, optimizer, train_loss, val_loss, iteration, batch_size, epoch, bestval_path)

                # Attention
                if enable_attention and hasattr(net, 'get_attention'):
                    net.eval()
                    # For each dataframe show the attention for a real,fake couple of frames
                    # NOTE: the loop variable `tag` rebinds the run tag defined above; the run tag
                    # is not read again after this point in the function
                    for df, root, sample_idx, tag in [
                        (train_dfs[0], train_roots[0], train_dfs[0][train_dfs[0]['label'] == False].index[0],
                         'train/att/real'),
                        (train_dfs[0], train_roots[0], train_dfs[0][train_dfs[0]['label'] == True].index[0],
                         'train/att/fake'),
                    ]:
                        record = df.loc[sample_idx]
                        tb_attention(tb, tag, iteration, net, device, face_size, face_policy,
                                     transformer, root, record)

                # Stop condition: the learning rate is exactly the configured minimum
                if optimizer.param_groups[0]['lr'] == min_lr:
                    print('Reached minimum learning rate. Stopping.')
                    stop = True
                    break

            iteration += 1

            if iteration > max_num_iterations:
                print('Maximum number of iterations reached')
                stop = True
                break

            # End of iteration

        epoch += 1

    # Needed to flush out last events
    tb.close()

    print('Completed')
|
| 369 |
+
|
| 370 |
+
|
| 371 |
+
def tb_attention(tb: SummaryWriter,
                 tag: str,
                 iteration: int,
                 net: nn.Module,
                 device: torch.device,
                 patch_size_load: int,
                 face_crop_scale: str,
                 val_transformer: A.BasicTransform,
                 root: str,
                 record: pd.Series,
                 ):
    """
    Log to Tensorboard a face image multiplied by the attention mask of the network.

    :param tb: Tensorboard writer receiving the image
    :param tag: Tensorboard tag of the image
    :param iteration: global step associated with the image
    :param net: network exposing a ``get_attention`` method
    :param device: device the sample is moved to before being fed to the network
    :param patch_size_load: forwarded to load_face as ``size``
    :param face_crop_scale: forwarded to load_face as ``scale``
    :param val_transformer: transformation applied to the sample fed to the network
    :param root: forwarded to load_face together with record
    :param record: DataFrame row describing the face to load
    """
    # Crop face
    # Two versions of the same face: one transformed for the network, one only converted
    # to tensor, used as the picture the attention is overlaid on
    sample_t = load_face(record=record, root=root, size=patch_size_load, scale=face_crop_scale,
                         transformer=val_transformer)
    sample_t_clean = load_face(record=record, root=root, size=patch_size_load, scale=face_crop_scale,
                               transformer=ToTensorV2())

    # Honour the requested device. The previous `.cuda(device)` guarded by
    # torch.cuda.is_available() broke when a CPU device was passed on a CUDA-capable host.
    sample_t = sample_t.to(device)

    # Feed to net
    with torch.no_grad():
        att: torch.Tensor = net.get_attention(sample_t.unsqueeze(0))[0].cpu()

    # Resize the attention map to the face size and use it to modulate the clean face
    att_img: Image.Image = ToPILImage()(att)
    sample_img = ToPILImage()(sample_t_clean)
    att_img = att_img.resize(sample_img.size, resample=Image.NEAREST).convert('RGB')
    sample_att_img = ImageChops.multiply(sample_img, att_img)
    sample_att = ToTensor()(sample_att_img)
    tb.add_image(tag=tag, img_tensor=sample_att, global_step=iteration)
|
| 399 |
+
|
| 400 |
+
|
| 401 |
+
def batch_forward(net: nn.Module, device: torch.device, criterion, data: torch.Tensor, labels: torch.Tensor) -> (
        torch.Tensor, np.ndarray):
    """
    Forward a batch through the network and compute its loss.

    :param net: network called on the batch
    :param device: device the data and the labels are moved to
    :param criterion: loss called as criterion(output, labels)
    :param data: batch of inputs
    :param labels: batch of targets
    :return: (loss, pred) where loss is the tensor returned by the criterion (not detached,
             so it can be back-propagated) and pred is a NumPy array with the sigmoid of
             the network output
    """
    data = data.to(device)
    labels = labels.to(device)
    out = net(data)
    # Predictions are only used for metrics: detach them from the graph and move them to host
    pred = torch.sigmoid(out).detach().cpu().numpy()
    loss = criterion(out, labels)
    return loss, pred
|
| 409 |
+
|
| 410 |
+
|
| 411 |
+
def validation_routine(net, device, val_loader, criterion, tb, iteration, tag: str, loader_len_norm: int = None):
    """
    Evaluate the network on a whole data loader and log the results to Tensorboard.

    :param net: network to evaluate; it is switched to eval mode and left in that mode
    :param device: device the batches are moved to
    :param val_loader: loader yielding (data, labels) batches
    :param criterion: loss function; ROC AUC and PR curve are logged only for nn.BCEWithLogitsLoss
    :param tb: Tensorboard writer
    :param iteration: global step used for every logged value
    :param tag: prefix of the Tensorboard tags ('<tag>/loss', '<tag>/roc_auc', '<tag>/pr')
    :param loader_len_norm: divisor applied to len(val_loader) for the progress bar total;
                            defaults to the batch size of the loader
    :return: loss averaged over all the samples seen
    """
    net.eval()
    if loader_len_norm is None:
        loader_len_norm = val_loader.batch_size

    samples_seen = 0
    loss_sum = 0.
    all_preds = []
    all_labels = []

    progress = tqdm(val_loader, desc='Validation', leave=False, total=len(val_loader) // loader_len_norm)
    for batch_data, batch_labels in progress:
        batch_size_seen = len(batch_labels)
        all_labels.append(batch_labels.flatten())

        # Gradients are not needed for evaluation
        with torch.no_grad():
            batch_loss, batch_pred = batch_forward(net, device, criterion, batch_data, batch_labels)
        all_preds.append(batch_pred.flatten())

        samples_seen += batch_size_seen
        # Weight by the batch size so the final value is a per-sample mean
        loss_sum += batch_loss.item() * batch_size_seen

    # Logging
    val_loss = loss_sum / samples_seen
    tb.add_scalar('{}/loss'.format(tag), val_loss, iteration)

    if isinstance(criterion, nn.BCEWithLogitsLoss):
        val_labels = np.concatenate(all_labels)
        val_pred = np.concatenate(all_preds)
        tb.add_scalar('{}/roc_auc'.format(tag), roc_auc_score(val_labels, val_pred), iteration)
        tb.add_pr_curve('{}/pr'.format(tag), val_labels, val_pred, iteration)

    return val_loss
|
| 442 |
+
|
| 443 |
+
|
| 444 |
+
def save_model(net: nn.Module, optimizer: optim.Optimizer,
               train_loss: float, val_loss: float,
               iteration: int, batch_size: int, epoch: int,
               path: str):
    """
    Write a training checkpoint to disk.

    :param net: network whose state dict is stored under 'net'
    :param optimizer: optimizer whose state dict is stored under 'opt'
    :param train_loss: stored under 'train_loss'
    :param val_loss: stored under 'val_loss'
    :param iteration: stored under 'iteration'
    :param batch_size: stored under 'batch_size'
    :param epoch: stored under 'epoch'
    :param path: destination file; converted with str(), so path-like objects are accepted
    """
    destination = str(path)
    checkpoint = {
        'net': net.state_dict(),
        'opt': optimizer.state_dict(),
        'train_loss': train_loss,
        'val_loss': val_loss,
        'iteration': iteration,
        'batch_size': batch_size,
        'epoch': epoch,
    }
    torch.save(checkpoint, destination)
|
| 457 |
+
|
| 458 |
+
|
| 459 |
+
# Run the training entry point only when this file is executed as a script;
# importing the module (e.g. to reuse save_model / tb_attention) must not start a training
if __name__ == '__main__':
    main()
|
models/icpr2020dfdc/train_triplet.py
ADDED
|
@@ -0,0 +1,459 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Video Face Manipulation Detection Through Ensemble of CNNs
|
| 3 |
+
|
| 4 |
+
Image and Sound Processing Lab - Politecnico di Milano
|
| 5 |
+
|
| 6 |
+
Nicolò Bonettini
|
| 7 |
+
Edoardo Daniele Cannas
|
| 8 |
+
Sara Mandelli
|
| 9 |
+
Luca Bondi
|
| 10 |
+
Paolo Bestagini
|
| 11 |
+
"""
|
| 12 |
+
import argparse
|
| 13 |
+
import os
|
| 14 |
+
import shutil
|
| 15 |
+
import warnings
|
| 16 |
+
|
| 17 |
+
import numpy as np
|
| 18 |
+
import torch
|
| 19 |
+
import torch.multiprocessing
|
| 20 |
+
|
| 21 |
+
torch.multiprocessing.set_sharing_strategy('file_system')
|
| 22 |
+
import torch.nn as nn
|
| 23 |
+
import torch.optim as optim
|
| 24 |
+
from tensorboardX import SummaryWriter
|
| 25 |
+
from torch.utils.data import DataLoader
|
| 26 |
+
from tqdm import tqdm
|
| 27 |
+
|
| 28 |
+
from architectures import tripletnet
|
| 29 |
+
from train_binclass import save_model, tb_attention
|
| 30 |
+
from isplutils.data import FrameFaceIterableDataset
|
| 31 |
+
from isplutils.data_siamese import FrameFaceTripletIterableDataset
|
| 32 |
+
from isplutils import split, utils
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def main():
    """Train a triplet network for face-manipulation detection.

    Parses the command line, builds the network, optimizer and LR scheduler,
    optionally resumes from a checkpoint, then loops over the triplet training
    set. Training loss is logged every ``--logint`` iterations; validation,
    LR scheduling and checkpointing happen every ``--valint`` iterations.
    Training stops when the learning rate reaches its minimum or when
    ``--maxiter`` iterations have been executed.
    """
    # Args
    parser = argparse.ArgumentParser()
    parser.add_argument('--net', type=str, help='Net model class', required=True)
    parser.add_argument('--traindb', type=str, help='Training datasets', nargs='+', choices=split.available_datasets,
                        required=True)
    parser.add_argument('--valdb', type=str, help='Validation datasets', nargs='+', choices=split.available_datasets,
                        required=True)
    parser.add_argument('--dfdc_faces_df_path', type=str, action='store',
                        help='Path to the Pandas Dataframe obtained from extract_faces.py on the DFDC dataset. '
                             'Required for training/validating on the DFDC dataset.')
    parser.add_argument('--dfdc_faces_dir', type=str, action='store',
                        help='Path to the directory containing the faces extracted from the DFDC dataset. '
                             'Required for training/validating on the DFDC dataset.')
    parser.add_argument('--ffpp_faces_df_path', type=str, action='store',
                        help='Path to the Pandas Dataframe obtained from extract_faces.py on the FF++ dataset. '
                             'Required for training/validating on the FF++ dataset.')
    parser.add_argument('--ffpp_faces_dir', type=str, action='store',
                        help='Path to the directory containing the faces extracted from the FF++ dataset. '
                             'Required for training/validating on the FF++ dataset.')
    parser.add_argument('--face', type=str, help='Face crop or scale', required=True,
                        choices=['scale', 'tight'])
    parser.add_argument('--size', type=int, help='Train patch size', required=True)

    parser.add_argument('--batch', type=int, help='Batch size to fit in GPU memory', default=12)
    parser.add_argument('--lr', type=float, default=1e-5, help='Learning rate')
    parser.add_argument('--valint', type=int, help='Validation interval (iterations)', default=500)
    parser.add_argument('--patience', type=int, help='Patience before dropping the LR [validation intervals]',
                        default=10)
    parser.add_argument('--maxiter', type=int, help='Maximum number of iterations', default=20000)
    parser.add_argument('--init', type=str, help='Weight initialization file')
    parser.add_argument('--scratch', action='store_true', help='Train from scratch')

    parser.add_argument('--traintriplets', type=int, help='Limit the number of train triplets per epoch', default=-1)
    parser.add_argument('--valtriplets', type=int, help='Limit the number of validation triplets per epoch',
                        default=2000)

    parser.add_argument('--logint', type=int, help='Training log interval (iterations)', default=100)
    parser.add_argument('--workers', type=int, help='Num workers for data loaders', default=6)
    parser.add_argument('--device', type=int, help='GPU device id', default=0)
    parser.add_argument('--seed', type=int, help='Random seed', default=0)

    parser.add_argument('--debug', action='store_true', help='Activate debug')
    parser.add_argument('--suffix', type=str, help='Suffix to default tag')

    parser.add_argument('--attention', action='store_true',
                        help='Enable Tensorboard log of attention masks')
    parser.add_argument('--embedding', action='store_true', help='Activate embedding visualization in TensorBoard')
    parser.add_argument('--embeddingint', type=int, help='Embedding visualization interval in TensorBoard',
                        default=5000)

    parser.add_argument('--log_dir', type=str, help='Directory for saving the training logs',
                        default='runs/triplet/')
    parser.add_argument('--models_dir', type=str, help='Directory for saving the models weights',
                        default='weights/triplet/')

    args = parser.parse_args()

    # Parse arguments
    net_class = getattr(tripletnet, args.net)
    train_datasets = args.traindb
    val_datasets = args.valdb
    dfdc_df_path = args.dfdc_faces_df_path
    ffpp_df_path = args.ffpp_faces_df_path
    dfdc_faces_dir = args.dfdc_faces_dir
    ffpp_faces_dir = args.ffpp_faces_dir
    face_policy = args.face
    face_size = args.size

    batch_size = args.batch
    initial_lr = args.lr
    validation_interval = args.valint
    patience = args.patience
    max_num_iterations = args.maxiter
    initial_model = args.init
    train_from_scratch = args.scratch

    max_train_triplets = args.traintriplets
    max_val_triplets = args.valtriplets

    log_interval = args.logint
    num_workers = args.workers
    device = torch.device('cuda:{:d}'.format(args.device)) if torch.cuda.is_available() else torch.device('cpu')
    seed = args.seed

    debug = args.debug
    suffix = args.suffix

    enable_attention = args.attention
    enable_embedding = args.embedding
    embedding_interval = args.embeddingint

    weights_folder = args.models_dir
    logs_folder = args.log_dir

    # Random initialization
    np.random.seed(seed)
    torch.random.manual_seed(seed)

    # Load net
    net: nn.Module = net_class().to(device)

    # Loss and optimizers
    criterion = nn.TripletMarginLoss()

    min_lr = initial_lr * 1e-5
    optimizer = optim.Adam(net.get_trainable_parameters(), lr=initial_lr)
    lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer=optimizer,
        mode='min',
        factor=0.1,
        patience=patience,
        cooldown=2 * patience,
        min_lr=min_lr,
    )

    # Run tag: names the checkpoint/log sub-folders and prefixes the embedding tags below
    tag = utils.make_train_tag(net_class=net_class,
                               traindb=train_datasets,
                               face_policy=face_policy,
                               patch_size=face_size,
                               seed=seed,
                               suffix=suffix,
                               debug=debug,
                               )

    # Model checkpoint paths
    bestval_path = os.path.join(weights_folder, tag, 'bestval.pth')
    last_path = os.path.join(weights_folder, tag, 'last.pth')
    periodic_path = os.path.join(weights_folder, tag, 'it{:06d}.pth')

    os.makedirs(os.path.join(weights_folder, tag), exist_ok=True)

    # Load model
    val_loss = min_val_loss = 20
    epoch = iteration = 0
    net_state = None
    opt_state = None
    if initial_model is not None:
        # If given load initial model (weights only: optimizer state and counters start fresh)
        print('Loading model from: {}'.format(initial_model))
        state = torch.load(initial_model, map_location='cpu')
        net_state = state['net']
    elif not train_from_scratch and os.path.exists(last_path):
        # Resume the previous run with the same tag
        print('Loading model from: {}'.format(last_path))
        state = torch.load(last_path, map_location='cpu')
        net_state = state['net']
        opt_state = state['opt']
        iteration = state['iteration'] + 1
        epoch = state['epoch']
    if not train_from_scratch and os.path.exists(bestval_path):
        state = torch.load(bestval_path, map_location='cpu')
        min_val_loss = state['val_loss']
    if net_state is not None:
        # Binary-classifier checkpoints lack the 'feat_ext.' prefix used by the triplet nets
        adapt_binclass_model(net_state)
        incomp_keys = net.load_state_dict(net_state, strict=False)
        print(incomp_keys)
    if opt_state is not None:
        # The learning rate requested on the command line overrides the one stored in the checkpoint
        for param_group in opt_state['param_groups']:
            param_group['lr'] = initial_lr
        optimizer.load_state_dict(opt_state)

    # Initialize Tensorboard
    logdir = os.path.join(logs_folder, tag)
    if iteration == 0:
        # If training from scratch or initialization remove history if exists
        shutil.rmtree(logdir, ignore_errors=True)

    # TensorboardX instance
    tb = SummaryWriter(logdir=logdir)
    if iteration == 0:
        dummy = torch.randn((1, 3, face_size, face_size), device=device)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            tb.add_graph(net, [dummy, dummy, dummy], verbose=False)

    transformer = utils.get_transformer(face_policy=face_policy, patch_size=face_size,
                                        net_normalizer=net.get_normalizer(), train=True)

    # Datasets and data loaders
    print('Loading data')
    # Check if paths for DFDC and FF++ extracted faces and DataFrames are provided
    for stage, datasets in (('training', train_datasets), ('validation', val_datasets)):
        for dataset in datasets:
            family = dataset.split('-')[0]
            if family == 'dfdc' and (dfdc_df_path is None or dfdc_faces_dir is None):
                raise RuntimeError('Specify DataFrame and directory for DFDC faces for {}!'.format(stage))
            elif family == 'ff' and (ffpp_df_path is None or ffpp_faces_dir is None):
                raise RuntimeError('Specify DataFrame and directory for FF++ faces for {}!'.format(stage))
    splits = split.make_splits(dfdc_df=dfdc_df_path, ffpp_df=ffpp_df_path, dfdc_dir=dfdc_faces_dir,
                               ffpp_dir=ffpp_faces_dir, dbs={'train': train_datasets, 'val': val_datasets})
    train_dfs = [splits['train'][db][0] for db in splits['train']]
    train_roots = [splits['train'][db][1] for db in splits['train']]
    val_roots = [splits['val'][db][1] for db in splits['val']]
    val_dfs = [splits['val'][db][0] for db in splits['val']]

    train_dataset = FrameFaceTripletIterableDataset(roots=train_roots,
                                                    dfs=train_dfs,
                                                    scale=face_policy,
                                                    num_triplets=max_train_triplets,
                                                    transformer=transformer,
                                                    size=face_size,
                                                    )

    val_dataset = FrameFaceTripletIterableDataset(roots=val_roots,
                                                  dfs=val_dfs,
                                                  scale=face_policy,
                                                  num_triplets=max_val_triplets,
                                                  transformer=transformer,
                                                  size=face_size,
                                                  )

    train_loader = DataLoader(train_dataset, num_workers=num_workers, batch_size=batch_size, )

    val_loader = DataLoader(val_dataset, num_workers=num_workers, batch_size=batch_size, )

    print('Training triplets: {}'.format(len(train_dataset)))
    print('Validation triplets: {}'.format(len(val_dataset)))

    if len(train_dataset) == 0:
        print('No training triplets. Halt.')
        return

    if len(val_dataset) == 0:
        print('No validation triplets. Halt.')
        return

    # Embedding visualization
    if enable_embedding:
        train_dataset_embedding = FrameFaceIterableDataset(roots=train_roots,
                                                           dfs=train_dfs,
                                                           scale=face_policy,
                                                           num_samples=64,
                                                           transformer=transformer,
                                                           size=face_size,
                                                           )
        train_loader_embedding = DataLoader(train_dataset_embedding, num_workers=num_workers, batch_size=batch_size, )
        val_dataset_embedding = FrameFaceIterableDataset(roots=val_roots,
                                                         dfs=val_dfs,
                                                         scale=face_policy,
                                                         num_samples=64,
                                                         transformer=transformer,
                                                         size=face_size,
                                                         )
        val_loader_embedding = DataLoader(val_dataset_embedding, num_workers=num_workers, batch_size=batch_size, )

    else:
        train_loader_embedding = None
        val_loader_embedding = None

    stop = False
    while not stop:

        # Training
        optimizer.zero_grad()

        train_loss = train_num = 0
        # len(train_loader) is already expressed in batches, so it is used directly as the bar total
        for train_batch in tqdm(train_loader, desc='Epoch {:03d}'.format(epoch), leave=False,
                                total=len(train_loader)):
            net.train()
            train_batch_num = len(train_batch[0])
            train_num += train_batch_num

            train_batch_loss = batch_forward(net, device, criterion, train_batch)

            if torch.isnan(train_batch_loss):
                raise ValueError('NaN loss')

            train_loss += train_batch_loss.item() * train_batch_num

            # Optimization
            train_batch_loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            # Logging
            if iteration > 0 and (iteration % log_interval == 0):
                train_loss /= train_num
                tb.add_scalar('train/loss', train_loss, iteration)
                tb.add_scalar('lr', optimizer.param_groups[0]['lr'], iteration)
                tb.add_scalar('epoch', epoch, iteration)

                # Checkpoint
                save_model(net, optimizer, train_loss, val_loss, iteration, batch_size, epoch, last_path)
                train_loss = train_num = 0

            # Validation
            if iteration > 0 and (iteration % validation_interval == 0):

                # Validation
                val_loss = validation_routine(net, device, val_loader, criterion, tb, iteration, tag='val')
                tb.flush()

                # LR Scheduler
                lr_scheduler.step(val_loss)

                # Model checkpoint
                save_model(net, optimizer, train_loss, val_loss, iteration, batch_size, epoch,
                           periodic_path.format(iteration))
                if val_loss < min_val_loss:
                    min_val_loss = val_loss
                    shutil.copy(periodic_path.format(iteration), bestval_path)

                # Attention
                if enable_attention and hasattr(net, 'feat_ext') and hasattr(net.feat_ext, 'get_attention'):
                    net.eval()
                    # For each dataframe show the attention for a real,fake couple of frames

                    # The loop variable is named att_tag so that it does not overwrite the run
                    # tag, which the embedding visualization below still needs.
                    for df, root, sample_idx, att_tag in [
                        (train_dfs[0], train_roots[0], train_dfs[0][train_dfs[0]['label'] == False].index[0],
                         'train/att/real'),
                        (train_dfs[0], train_roots[0], train_dfs[0][train_dfs[0]['label'] == True].index[0],
                         'train/att/fake'),
                    ]:
                        record = df.loc[sample_idx]
                        tb_attention(tb, att_tag, iteration, net.feat_ext, device, face_size, face_policy,
                                     transformer, root, record)

                # The LR only changes in the scheduler step above, so it is checked right after it
                if optimizer.param_groups[0]['lr'] <= min_lr:
                    print('Reached minimum learning rate. Stopping.')
                    stop = True
                    break

            # Embedding visualization
            if enable_embedding:
                if iteration > 0 and (iteration % embedding_interval == 0):
                    embedding_routine(net=net,
                                      device=device,
                                      loader=train_loader_embedding,
                                      iteration=iteration,
                                      tb=tb,
                                      tag=tag + '/train')
                    embedding_routine(net=net,
                                      device=device,
                                      loader=val_loader_embedding,
                                      iteration=iteration,
                                      tb=tb,
                                      tag=tag + '/val')

            iteration += 1

            if iteration > max_num_iterations:
                print('Maximum number of iterations reached')
                stop = True
                break

            # End of iteration

        epoch += 1

    # Needed to flush out last events
    tb.close()

    print('Completed')
|
| 391 |
+
|
| 392 |
+
|
| 393 |
+
def adapt_binclass_model(net_state):
    """Prefix every key of ``net_state`` with ``'feat_ext.'`` when none has it yet.

    The mapping is modified in place and nothing is returned. If at least one
    key already starts with ``'feat_ext.'`` the mapping is left untouched.
    """
    if any(name.startswith('feat_ext.') for name in net_state):
        return

    # No key carries the prefix: rename all of them
    print('Adapting keys')
    for name in list(net_state):
        net_state['feat_ext.{}'.format(name)] = net_state.pop(name)
|
| 407 |
+
|
| 408 |
+
|
| 409 |
+
def batch_forward(net: nn.Module, device, criterion, data: tuple) -> torch.Tensor:
    """Run one forward pass and return the loss.

    :param net: network called as ``net(*data)``
    :param device: device every tensor of ``data`` is moved to
    :param criterion: loss called as ``criterion(*out)`` on the network outputs
    :param data: tensors of the batch, in the order expected by ``net``
    :return: the loss returned by ``criterion``
    """
    # .to(device) is a no-op for tensors already on the target device and, unlike
    # .cuda(device) gated on CUDA availability, also works when the caller
    # deliberately selects a CPU device on a CUDA-capable machine.
    data = [tensor.to(device) for tensor in data]
    out = net(*data)
    return criterion(*out)
|
| 415 |
+
|
| 416 |
+
|
| 417 |
+
def validation_routine(net, device, val_loader, criterion, tb, iteration, tag):
    """Evaluate ``net`` on ``val_loader`` and log the sample-weighted mean loss.

    :param net: network to evaluate; switched to eval mode here
    :param device: device forwarded to ``batch_forward``
    :param val_loader: loader yielding validation batches
    :param criterion: loss forwarded to ``batch_forward``
    :param tb: writer exposing ``add_scalar``
    :param iteration: global step used for the logged scalar
    :param tag: prefix of the logged scalar (``'<tag>/loss'``)
    :return: mean validation loss over all samples seen
    """
    net.eval()

    val_num = 0
    val_loss = 0.
    # len(val_loader) is already expressed in batches; dividing it by the batch size again
    # made the progress bar total far too small.
    for val_data in tqdm(val_loader, desc='Validation', leave=False, total=len(val_loader)):
        val_batch_num = len(val_data[0])
        with torch.no_grad():
            val_batch_loss = batch_forward(net, device, criterion, val_data, )
        val_num += val_batch_num
        # Weight each batch loss by its size so that a short last batch does not skew the mean
        val_loss += val_batch_loss.item() * val_batch_num

    # Logging
    val_loss /= val_num
    tb.add_scalar('{}/loss'.format(tag), val_loss, iteration)

    return val_loss
|
| 434 |
+
|
| 435 |
+
|
| 436 |
+
def embedding_routine(net: nn.Module, device: torch.device, loader: DataLoader, tb: SummaryWriter, iteration: int,
                      tag: str):
    """Compute the features of every batch of ``loader`` and log them as an embedding.

    :param net: network exposing ``features``; switched to eval mode here
    :param device: device the faces are moved to when CUDA is available
    :param loader: iterable of ``(faces, labels)`` batches
    :param tb: writer exposing ``add_embedding``
    :param iteration: global step of the logged embedding
    :param tag: tag of the logged embedding
    """
    net.eval()

    cuda_available = torch.cuda.is_available()
    label_chunks = []
    feature_chunks = []
    for faces, face_labels in loader:
        if cuda_available:
            faces = faces.to(device)
        with torch.no_grad():
            feats = net.features(faces)
        label_chunks.append(face_labels.numpy().flatten())
        # One row per sample: everything after the batch dimension is flattened
        feature_chunks.append(torch.flatten(feats.cpu(), start_dim=1).numpy())

    all_labels = list(np.concatenate(label_chunks))
    all_features = np.concatenate(feature_chunks)

    # Logging
    tb.add_embedding(mat=all_features, metadata=all_labels, tag=tag, global_step=iteration)
|
| 456 |
+
|
| 457 |
+
|
| 458 |
+
if __name__ == '__main__':
|
| 459 |
+
main()
|
models/model_loader.py
CHANGED
|
@@ -27,6 +27,7 @@ class ModelLoader:
|
|
| 27 |
cls._instance._face_detector = None
|
| 28 |
cls._instance._spacy_nlp = None
|
| 29 |
cls._instance._sentence_transformer = None
|
|
|
|
| 30 |
return cls._instance
|
| 31 |
|
| 32 |
@classmethod
|
|
@@ -146,6 +147,23 @@ class ModelLoader:
|
|
| 146 |
logger.info("MediaPipe FaceMesh loaded")
|
| 147 |
return self._face_detector
|
| 148 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 149 |
# ---------- Preload ----------
|
| 150 |
def preload_phase1(self) -> None:
|
| 151 |
"""Preload only what Phase 1 needs (image model)."""
|
|
|
|
| 27 |
cls._instance._face_detector = None
|
| 28 |
cls._instance._spacy_nlp = None
|
| 29 |
cls._instance._sentence_transformer = None
|
| 30 |
+
cls._instance._efficientnet_detector = None
|
| 31 |
return cls._instance
|
| 32 |
|
| 33 |
@classmethod
|
|
|
|
| 147 |
logger.info("MediaPipe FaceMesh loaded")
|
| 148 |
return self._face_detector
|
| 149 |
|
| 150 |
+
# ---------- EfficientNetAutoAttB4 (ICPR2020 / DeepShield1 merge) ----------
|
| 151 |
+
def load_efficientnet(self):
    """Return the cached EfficientNet detector, building it on first use.

    Returns None (after logging a warning) when the detector cannot be
    imported or constructed; nothing is cached in that case.
    """
    if self._efficientnet_detector is not None:
        return self._efficientnet_detector

    try:
        # Imported lazily, inside the try block, so that an import failure is handled like any other load failure
        from services.efficientnet_service import EfficientNetDetector

        detector = EfficientNetDetector(
            model_name=settings.EFFICIENTNET_MODEL,
            train_db=settings.EFFICIENTNET_TRAIN_DB,
            device=settings.DEVICE,
        )
    except Exception as e:
        logger.warning(f"EfficientNet load failed (continuing without it): {e}")
        return None

    self._efficientnet_detector = detector
    return detector
|
| 166 |
+
|
| 167 |
# ---------- Preload ----------
|
| 168 |
def preload_phase1(self) -> None:
|
| 169 |
"""Preload only what Phase 1 needs (image model)."""
|
requirements.txt
CHANGED
|
@@ -11,6 +11,13 @@ alembic==1.13.3
|
|
| 11 |
python-jose[cryptography]==3.3.0
|
| 12 |
bcrypt==4.2.0
|
| 13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
# === Phase 1: Image Detection ===
|
| 15 |
# Install torch separately with CPU index first (see README): pip install torch==2.4.1 --index-url https://download.pytorch.org/whl/cpu
|
| 16 |
torch==2.4.1
|
|
|
|
| 11 |
python-jose[cryptography]==3.3.0
|
| 12 |
bcrypt==4.2.0
|
| 13 |
|
| 14 |
+
# === MERGE: EfficientNetAutoAttB4 (DeepShield1 / ICPR2020) ===
|
| 15 |
+
albumentations>=1.3.0,<1.5 # Required by icpr2020dfdc isplutils transforms; pin to avoid 1.5+ API break
|
| 16 |
+
scipy>=1.13.0 # expit (sigmoid) for EfficientNet logit conversion
|
| 17 |
+
# NOTE: MERGE_PLAN §4 said NOT to install efficientnet-pytorch, but fornet.py imports it directly.
|
| 18 |
+
efficientnet-pytorch==0.7.1 # Required by icpr2020dfdc/architectures/fornet.py
|
| 19 |
+
psutil>=5.9.0 # RAM monitoring in smoke tests
|
| 20 |
+
|
| 21 |
# === Phase 1: Image Detection ===
|
| 22 |
# Install torch separately with CPU index first (see README): pip install torch==2.4.1 --index-url https://download.pytorch.org/whl/cpu
|
| 23 |
torch==2.4.1
|
schemas/common.py
CHANGED
|
@@ -86,3 +86,4 @@ class ProcessingSummary(BaseModel):
|
|
| 86 |
stages_completed: List[str]
|
| 87 |
total_duration_ms: int
|
| 88 |
model_used: str
|
|
|
|
|
|
| 86 |
stages_completed: List[str]
|
| 87 |
total_duration_ms: int
|
| 88 |
model_used: str
|
| 89 |
+
models_used: List[str] = [] # all models that contributed (ensemble)
|
services/efficientnet_service.py
ADDED
|
@@ -0,0 +1,209 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""EfficientNetAutoAttB4 adapter — wraps ICPR2020 DFDC model into DeepShield service interface."""
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
import pickle
|
| 5 |
+
import sys
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
from typing import List, Optional
|
| 8 |
+
|
| 9 |
+
import numpy as np
|
| 10 |
+
import torch
|
| 11 |
+
from loguru import logger
|
| 12 |
+
from PIL import Image
|
| 13 |
+
from scipy.special import expit
|
| 14 |
+
from torch.utils.model_zoo import load_url
|
| 15 |
+
|
| 16 |
+
# Resolve ICPR2020 repo root and patch sys.path so its modules are importable.
|
| 17 |
+
_ICPR_ROOT = (Path(__file__).resolve().parent.parent / "models" / "icpr2020dfdc").resolve()
|
| 18 |
+
_NOTEBOOK_DIR = str(_ICPR_ROOT / "notebook")
|
| 19 |
+
if str(_ICPR_ROOT) not in sys.path:
|
| 20 |
+
sys.path.insert(0, str(_ICPR_ROOT))
|
| 21 |
+
if _NOTEBOOK_DIR not in sys.path:
|
| 22 |
+
sys.path.insert(0, _NOTEBOOK_DIR)
|
| 23 |
+
|
| 24 |
+
# These imports are valid only after the sys.path patch above.
|
| 25 |
+
from blazeface import BlazeFace, FaceExtractor # noqa: E402
|
| 26 |
+
from architectures import fornet, weights # noqa: E402
|
| 27 |
+
from isplutils import utils as ispl_utils # noqa: E402
|
| 28 |
+
|
| 29 |
+
# Default calibrator path — populated by scripts/fit_calibrator.py.
|
| 30 |
+
_CALIBRATOR_PATH = Path(__file__).resolve().parent.parent / "models" / "efficientnet_calibrator.pkl"
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def _load_calibrator(path: Path = _CALIBRATOR_PATH):
    """Unpickle the calibrator stored at ``path``.

    Returns None when the file does not exist or cannot be loaded; a load
    failure is logged as a warning. Unpickling executes code embedded in the
    file, so ``path`` must point to a trusted artifact.
    """
    if path.exists():
        try:
            with path.open("rb") as fh:
                calibrator = pickle.load(fh)
            logger.info(f"Isotonic calibrator loaded from {path}")
            return calibrator
        except Exception as exc:
            logger.warning(f"Failed to load calibrator ({exc}) — using raw sigmoid scores")
    return None
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
class EfficientNetDetector:
|
| 48 |
+
"""Thin adapter that loads EfficientNetAutoAttB4 (DFDC-trained) and exposes
|
| 49 |
+
detect_image() / detect_video_frames() matching DeepShield's service interface.
|
| 50 |
+
|
| 51 |
+
If backend/models/efficientnet_calibrator.pkl exists (produced by
|
| 52 |
+
scripts/fit_calibrator.py), raw sigmoid scores are passed through an isotonic
|
| 53 |
+
regression calibrator before being returned. Set calibrator=None to disable.
|
| 54 |
+
"""
|
| 55 |
+
|
| 56 |
+
def __init__(
|
| 57 |
+
self,
|
| 58 |
+
model_name: str = "EfficientNetAutoAttB4",
|
| 59 |
+
train_db: str = "DFDC",
|
| 60 |
+
device: str = "cpu",
|
| 61 |
+
calibrator_path: Optional[Path] = None,
|
| 62 |
+
) -> None:
|
| 63 |
+
self.device = torch.device(device)
|
| 64 |
+
self.model_name = model_name
|
| 65 |
+
self.train_db = train_db
|
| 66 |
+
|
| 67 |
+
weight_key = f"{model_name}_{train_db}"
|
| 68 |
+
if weight_key not in weights.weight_url:
|
| 69 |
+
raise KeyError(f"Unknown model/DB combination: {weight_key}")
|
| 70 |
+
|
| 71 |
+
self.net = getattr(fornet, model_name)().eval().to(self.device)
|
| 72 |
+
# check_hash=False — the ISPL mirror occasionally has stale sha256 hashes in URLs.
|
| 73 |
+
state = load_url(weights.weight_url[weight_key], map_location=self.device, check_hash=False)
|
| 74 |
+
self.net.load_state_dict(state)
|
| 75 |
+
|
| 76 |
+
self.transf = ispl_utils.get_transformer(
|
| 77 |
+
"scale", 224, self.net.get_normalizer(), train=False
|
| 78 |
+
)
|
| 79 |
+
|
| 80 |
+
blazeface_dir = _ICPR_ROOT / "blazeface"
|
| 81 |
+
weights_path = blazeface_dir / "blazeface.pth"
|
| 82 |
+
anchors_path = blazeface_dir / "anchors.npy"
|
| 83 |
+
if not weights_path.exists() or not anchors_path.exists():
|
| 84 |
+
raise FileNotFoundError(
|
| 85 |
+
f"BlazeFace assets missing: expected {weights_path} and {anchors_path}. "
|
| 86 |
+
"Ensure icpr2020dfdc is cloned into backend/models/ with its blazeface/ subdirectory."
|
| 87 |
+
)
|
| 88 |
+
|
| 89 |
+
self.facedet = BlazeFace().to(self.device)
|
| 90 |
+
self.facedet.load_weights(str(weights_path))
|
| 91 |
+
self.facedet.load_anchors(str(anchors_path))
|
| 92 |
+
self.face_extractor = FaceExtractor(facedet=self.facedet)
|
| 93 |
+
|
| 94 |
+
self.calibrator = _load_calibrator(calibrator_path or _CALIBRATOR_PATH)
|
| 95 |
+
self.calibrator_applied = self.calibrator is not None
|
| 96 |
+
|
| 97 |
+
logger.info(
|
| 98 |
+
f"EfficientNetDetector ready: {model_name}/{train_db} on {self.device} "
|
| 99 |
+
f"| calibrator={'yes' if self.calibrator_applied else 'no'}"
|
| 100 |
+
)
|
| 101 |
+
|
| 102 |
+
def _face_tensor(self, face_np: np.ndarray) -> torch.Tensor:
|
| 103 |
+
"""Apply albumentations transform to a cropped face array and return a CHW tensor."""
|
| 104 |
+
result = self.transf(image=face_np)
|
| 105 |
+
return result["image"]
|
| 106 |
+
|
| 107 |
+
def _calibrate(self, score: float) -> float:
|
| 108 |
+
"""Apply isotonic calibration if available; otherwise return score unchanged."""
|
| 109 |
+
if self.calibrator is None:
|
| 110 |
+
return score
|
| 111 |
+
try:
|
| 112 |
+
return float(self.calibrator.predict([[score]])[0])
|
| 113 |
+
except Exception:
|
| 114 |
+
return score
|
| 115 |
+
|
| 116 |
+
def _calibrate_batch(self, scores: np.ndarray) -> np.ndarray:
|
| 117 |
+
"""Apply isotonic calibration to a 1-D array of scores."""
|
| 118 |
+
if self.calibrator is None:
|
| 119 |
+
return scores
|
| 120 |
+
try:
|
| 121 |
+
return self.calibrator.predict(scores.reshape(-1, 1)).flatten()
|
| 122 |
+
except Exception:
|
| 123 |
+
return scores
|
| 124 |
+
|
| 125 |
+
def raw_logit(self, face_tensor: torch.Tensor) -> float:
|
| 126 |
+
"""Return raw logit for a single face tensor — used by fit_calibrator.py."""
|
| 127 |
+
with torch.inference_mode():
|
| 128 |
+
return float(self.net(face_tensor.unsqueeze(0).to(self.device)).item())
|
| 129 |
+
|
| 130 |
+
def detect_image(self, pil_image: Image.Image) -> dict:
|
| 131 |
+
"""Run EfficientNet on a single PIL image.
|
| 132 |
+
|
| 133 |
+
Returns:
|
| 134 |
+
{"score": float|None, "result": "FAKE"|"REAL"|None, "model": str,
|
| 135 |
+
"error": str|None, "calibrator_applied": bool}
|
| 136 |
+
"""
|
| 137 |
+
if pil_image.mode != "RGB":
|
| 138 |
+
pil_image = pil_image.convert("RGB")
|
| 139 |
+
img_array = np.array(pil_image)
|
| 140 |
+
|
| 141 |
+
frame_data = self.face_extractor.process_image(img=img_array)
|
| 142 |
+
faces: list = frame_data.get("faces", [])
|
| 143 |
+
if not faces:
|
| 144 |
+
logger.debug("EfficientNetDetector.detect_image: no face detected")
|
| 145 |
+
return {
|
| 146 |
+
"error": "no_face",
|
| 147 |
+
"score": None,
|
| 148 |
+
"result": None,
|
| 149 |
+
"model": f"{self.model_name}_{self.train_db}",
|
| 150 |
+
"calibrator_applied": False,
|
| 151 |
+
}
|
| 152 |
+
|
| 153 |
+
face_t = self._face_tensor(faces[0])
|
| 154 |
+
with torch.inference_mode():
|
| 155 |
+
logit = self.net(face_t.unsqueeze(0).to(self.device))
|
| 156 |
+
raw_score = float(torch.sigmoid(logit).item())
|
| 157 |
+
|
| 158 |
+
score = self._calibrate(raw_score)
|
| 159 |
+
return {
|
| 160 |
+
"score": score,
|
| 161 |
+
"result": "FAKE" if score > 0.5 else "REAL",
|
| 162 |
+
"model": f"{self.model_name}_{self.train_db}",
|
| 163 |
+
"error": None,
|
| 164 |
+
"calibrator_applied": self.calibrator_applied,
|
| 165 |
+
}
|
| 166 |
+
|
| 167 |
+
def detect_video_frames(self, frames: List[np.ndarray]) -> dict:
    """Score a list of numpy frames and summarise them for the whole video.

    Every 3-channel ``uint8`` frame has its channel axis reversed before face
    extraction; other frames are passed through untouched.
    NOTE(review): the reversal is unconditional, so a frame that is already
    RGB would be flipped as well — confirm what callers pass in.

    Frames in which the extractor finds no face are dropped, so ``per_frame``
    can be shorter than ``frames`` and is not index-aligned with it. Only the
    first face of each frame is scored.

    Returns:
        A dict with the keys
        ``mean_score`` (calibrated sigmoid of the mean logit, or None when no
        frame contained a face),
        ``per_frame`` (list of calibrated per-face scores),
        ``model`` ("<model_name>_<train_db>"),
        ``error`` ("no_faces" or None) and
        ``calibrator_applied`` (False on the no-face path, otherwise
        ``self.calibrator_applied``).
    """

    def _extractor_input(frame):
        # Reverse the channel axis only for 3-channel uint8 arrays.
        if frame.ndim == 3 and frame.shape[2] == 3 and frame.dtype == np.uint8:
            return frame[..., ::-1].copy()
        return frame

    collected = []
    for frame in frames:
        extraction = self.face_extractor.process_image(img=_extractor_input(frame))
        found: list = extraction.get("faces", [])
        if not found:
            continue
        collected.append(self._face_tensor(found[0]))

    if not collected:
        logger.debug("EfficientNetDetector.detect_video_frames: no faces in any frame")
        return {
            "error": "no_faces",
            "mean_score": None,
            "per_frame": [],
            "model": f"{self.model_name}_{self.train_db}",
            "calibrator_applied": False,
        }

    stacked = torch.stack(collected).to(self.device)
    with torch.inference_mode():
        logits = self.net(stacked).cpu().numpy().flatten()

    # Per-frame scores are calibrated individually; the video-level score is
    # the calibrated sigmoid of the averaged logits.
    frame_scores = self._calibrate_batch(expit(logits)).tolist()
    video_score = float(self._calibrate(float(expit(np.mean(logits)))))
    return {
        "mean_score": video_score,
        "per_frame": frame_scores,
        "model": f"{self.model_name}_{self.train_db}",
        "error": None,
        "calibrator_applied": self.calibrator_applied,
    }
|
services/image_service.py
CHANGED
|
@@ -1,8 +1,8 @@
|
|
| 1 |
from __future__ import annotations
|
| 2 |
|
| 3 |
import io
|
| 4 |
-
from dataclasses import dataclass
|
| 5 |
-
from typing import Tuple
|
| 6 |
|
| 7 |
import torch
|
| 8 |
from loguru import logger
|
|
@@ -17,6 +17,8 @@ class ImageClassification:
|
|
| 17 |
label: str
|
| 18 |
confidence: float
|
| 19 |
all_scores: dict[str, float]
|
|
|
|
|
|
|
| 20 |
|
| 21 |
|
| 22 |
def load_image_from_bytes(data: bytes) -> Image.Image:
|
|
@@ -26,8 +28,8 @@ def load_image_from_bytes(data: bytes) -> Image.Image:
|
|
| 26 |
return img
|
| 27 |
|
| 28 |
|
| 29 |
-
def
|
| 30 |
-
"""Run the ViT deepfake classifier
|
| 31 |
loader = get_model_loader()
|
| 32 |
model, processor = loader.load_image_model()
|
| 33 |
|
|
@@ -36,17 +38,88 @@ def classify_image(pil_img: Image.Image) -> ImageClassification:
|
|
| 36 |
|
| 37 |
with torch.no_grad():
|
| 38 |
outputs = model(**inputs)
|
| 39 |
-
logits = outputs.logits
|
| 40 |
probs = torch.softmax(logits, dim=-1)[0]
|
| 41 |
|
| 42 |
id2label: dict[int, str] = getattr(model.config, "id2label", {})
|
| 43 |
all_scores = {id2label.get(i, str(i)): float(p.item()) for i, p in enumerate(probs)}
|
| 44 |
top_idx = int(torch.argmax(probs).item())
|
| 45 |
top_label = id2label.get(top_idx, str(top_idx))
|
| 46 |
-
top_conf = float(probs[top_idx].item())
|
| 47 |
|
| 48 |
-
|
| 49 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
|
| 51 |
|
| 52 |
def preprocess_and_classify(raw_bytes: bytes) -> Tuple[Image.Image, ImageClassification]:
|
|
|
|
| 1 |
from __future__ import annotations
|
| 2 |
|
| 3 |
import io
|
| 4 |
+
from dataclasses import dataclass, field
|
| 5 |
+
from typing import List, Optional, Tuple
|
| 6 |
|
| 7 |
import torch
|
| 8 |
from loguru import logger
|
|
|
|
| 17 |
label: str
|
| 18 |
confidence: float
|
| 19 |
all_scores: dict[str, float]
|
| 20 |
+
models_used: List[str] = field(default_factory=list)
|
| 21 |
+
ensemble_method: Optional[str] = None
|
| 22 |
|
| 23 |
|
| 24 |
def load_image_from_bytes(data: bytes) -> Image.Image:
|
|
|
|
| 28 |
return img
|
| 29 |
|
| 30 |
|
| 31 |
+
def _classify_vit(pil_img: Image.Image) -> Tuple[float, str, dict[str, float]]:
|
| 32 |
+
"""Run the ViT deepfake classifier. Returns (fake_prob, top_label, all_scores)."""
|
| 33 |
loader = get_model_loader()
|
| 34 |
model, processor = loader.load_image_model()
|
| 35 |
|
|
|
|
| 38 |
|
| 39 |
with torch.no_grad():
|
| 40 |
outputs = model(**inputs)
|
| 41 |
+
logits = outputs.logits
|
| 42 |
probs = torch.softmax(logits, dim=-1)[0]
|
| 43 |
|
| 44 |
id2label: dict[int, str] = getattr(model.config, "id2label", {})
|
| 45 |
all_scores = {id2label.get(i, str(i)): float(p.item()) for i, p in enumerate(probs)}
|
| 46 |
top_idx = int(torch.argmax(probs).item())
|
| 47 |
top_label = id2label.get(top_idx, str(top_idx))
|
|
|
|
| 48 |
|
| 49 |
+
# Identify the fake probability — pick the highest score from fake-labelled classes.
|
| 50 |
+
fake_tokens = ("fake", "deepfake", "manipulated", "ai", "generated", "synthetic")
|
| 51 |
+
fake_prob = max(
|
| 52 |
+
(float(p) for lbl, p in all_scores.items() if any(t in lbl.lower() for t in fake_tokens)),
|
| 53 |
+
default=float(probs[top_idx].item()),
|
| 54 |
+
)
|
| 55 |
+
return fake_prob, top_label, all_scores
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def _vit_only_classification(
    fake_prob: float,
    scores: dict[str, float],
    models_used: List[str],
    ensemble_method: Optional[str] = None,
) -> ImageClassification:
    """Build the ViT-only result shared by every fallback path of classify_image."""
    return ImageClassification(
        label="Fake" if fake_prob >= 0.5 else "Real",
        confidence=fake_prob,
        all_scores=scores,
        models_used=models_used,
        ensemble_method=ensemble_method,
    )


def classify_image(pil_img: Image.Image) -> ImageClassification:
    """Classify an image as "Fake" or "Real", ensembling ViT with EfficientNet when enabled.

    The ViT classifier always runs. When ``settings.ENSEMBLE_MODE`` is true and
    the EfficientNet detector is available and returns a score, the two fake
    probabilities are averaged. Otherwise the ViT result is returned on its own:

    * ``ENSEMBLE_MODE`` is false          -> ``ensemble_method=None``
    * the loader returns no detector      -> ``ensemble_method=None``
    * the detector reports an error or no score (e.g. no face found)
                                          -> ``ensemble_method="vit_only_no_face"``

    Note that ``confidence`` on the result carries the *fake probability* used
    for the 0.5 threshold, not the confidence of the returned label.

    Args:
        pil_img: Image to classify.

    Returns:
        An ``ImageClassification``. In ensemble mode ``all_scores`` holds the
        ViT scores prefixed with ``vit_`` plus ``efficientnet_fake`` and
        ``efficientnet_real``.
    """
    vit_fake_prob, vit_label, vit_scores = _classify_vit(pil_img)
    models_used = [settings.IMAGE_MODEL_ID]

    if not settings.ENSEMBLE_MODE:
        logger.info(f"Image classify (ViT-only) → {vit_label} @ fake_p={vit_fake_prob:.3f}")
        return _vit_only_classification(vit_fake_prob, vit_scores, models_used)

    # Attempt EfficientNet inference.
    eff_detector = get_model_loader().load_efficientnet()
    if eff_detector is None:
        logger.warning("EfficientNet unavailable — falling back to ViT-only")
        return _vit_only_classification(vit_fake_prob, vit_scores, models_used)

    eff_result = eff_detector.detect_image(pil_img)
    if eff_result.get("error") or eff_result.get("score") is None:
        # The detector produced no usable score (typically: no face found) — trust ViT alone.
        logger.info(f"EfficientNet no-face fallback → using ViT score {vit_fake_prob:.3f}")
        return _vit_only_classification(
            vit_fake_prob,
            vit_scores,
            models_used,
            ensemble_method="vit_only_no_face",
        )

    eff_fake_prob: float = eff_result["score"]
    models_used.append(eff_result["model"])

    # Simple average ensemble.
    ensemble_prob = (vit_fake_prob + eff_fake_prob) / 2.0
    label = "Fake" if ensemble_prob >= 0.5 else "Real"
    logger.info(
        f"Image classify (ensemble) → {label} | vit={vit_fake_prob:.3f} eff={eff_fake_prob:.3f} avg={ensemble_prob:.3f}"
    )
    return ImageClassification(
        label=label,
        confidence=ensemble_prob,
        all_scores={
            **{f"vit_{k}": v for k, v in vit_scores.items()},
            "efficientnet_fake": eff_fake_prob,
            "efficientnet_real": 1.0 - eff_fake_prob,
        },
        models_used=models_used,
        ensemble_method="average",
    )
|
| 123 |
|
| 124 |
|
| 125 |
def preprocess_and_classify(raw_bytes: bytes) -> Tuple[Image.Image, ImageClassification]:
|
services/metadata_writer.py
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Optional ExifTool metadata writer — embeds DeepShield verdict into analyzed file metadata.
|
| 2 |
+
|
| 3 |
+
Gated behind EXIFTOOL_PATH env var. Silently skips if ExifTool is not configured.
|
| 4 |
+
Install ExifTool: https://exiftool.org/ — set EXIFTOOL_PATH in .env to enable.
|
| 5 |
+
"""
|
| 6 |
+
from __future__ import annotations
|
| 7 |
+
|
| 8 |
+
import subprocess
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
from typing import Optional
|
| 11 |
+
|
| 12 |
+
from loguru import logger
|
| 13 |
+
|
| 14 |
+
from config import settings
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def _exiftool_path() -> Optional[str]:
    """Return the configured ExifTool path, or None when it is unset or not a file.

    Reads ``EXIFTOOL_PATH`` from ``settings`` (treated as empty when the
    attribute is missing) and accepts it only if it points at an existing file.
    """
    configured = getattr(settings, "EXIFTOOL_PATH", "")
    if not configured:
        return None
    return configured if Path(configured).is_file() else None
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def write_verdict_metadata(
    file_path: str,
    verdict: str,
    authenticity_score: int,
    models_used: list[str],
    analysis_id: str,
) -> bool:
    """Write the analysis verdict into a file's Comment/UserComment tags via ExifTool.

    The file is modified in place (``-overwrite_original``). This is a
    best-effort operation: every failure is logged as a warning and reported
    through the return value, never raised.

    Args:
        file_path: Path of the file whose metadata should be updated.
        verdict: Verdict label to embed.
        authenticity_score: Score to embed.
        models_used: Model identifiers, embedded as a comma-separated list.
        analysis_id: Identifier of the analysis record.

    Returns:
        True when ExifTool exited with status 0; False when ExifTool is not
        configured, could not be started, timed out (15 s) or failed.
    """
    exiftool = _exiftool_path()
    if not exiftool:
        return False

    comment = (
        f"DeepShield verdict: {verdict} | "
        f"score: {authenticity_score} | "
        f"models: {','.join(models_used)} | "
        f"id: {analysis_id}"
    )

    # An argv list (no shell) keeps the comment text from being interpreted.
    command = [
        exiftool,
        f"-Comment={comment}",
        f"-UserComment={comment}",
        "-overwrite_original",
        file_path,
    ]

    try:
        completed = subprocess.run(command, capture_output=True, text=True, timeout=15)
        if completed.returncode != 0:
            logger.warning(f"ExifTool failed (rc={completed.returncode}): {completed.stderr.strip()}")
            return False
        logger.info(f"ExifTool wrote verdict metadata to {file_path}")
        return True
    except FileNotFoundError:
        logger.warning(f"ExifTool not found at {exiftool}")
        return False
    except subprocess.TimeoutExpired:
        logger.warning("ExifTool timed out writing metadata")
        return False
    except Exception as e:
        logger.warning(f"ExifTool metadata write failed: {e}")
        return False
|
services/video_service.py
CHANGED
|
@@ -1,15 +1,16 @@
|
|
| 1 |
from __future__ import annotations
|
| 2 |
|
| 3 |
from dataclasses import dataclass, field
|
| 4 |
-
from typing import List, Tuple
|
| 5 |
|
| 6 |
import cv2
|
| 7 |
import numpy as np
|
| 8 |
from loguru import logger
|
| 9 |
from PIL import Image
|
| 10 |
|
|
|
|
| 11 |
from models.model_loader import get_model_loader
|
| 12 |
-
from services.image_service import
|
| 13 |
|
| 14 |
|
| 15 |
@dataclass
|
|
@@ -18,10 +19,10 @@ class FrameAnalysis:
|
|
| 18 |
timestamp_s: float
|
| 19 |
label: str
|
| 20 |
confidence: float
|
| 21 |
-
suspicious_prob: float
|
| 22 |
is_suspicious: bool
|
| 23 |
has_face: bool = False
|
| 24 |
-
scored: bool = False
|
| 25 |
|
| 26 |
|
| 27 |
@dataclass
|
|
@@ -35,6 +36,8 @@ class VideoAggregation:
|
|
| 35 |
insufficient_faces: bool
|
| 36 |
suspicious_timestamps: List[float] = field(default_factory=list)
|
| 37 |
frames: List[FrameAnalysis] = field(default_factory=list)
|
|
|
|
|
|
|
| 38 |
|
| 39 |
|
| 40 |
FAKE_TOKENS = ("fake", "deepfake", "manipulated", "ai", "generated", "synthetic")
|
|
@@ -45,9 +48,9 @@ def _is_fake_label(label: str) -> bool:
|
|
| 45 |
return any(tok in l for tok in FAKE_TOKENS)
|
| 46 |
|
| 47 |
|
| 48 |
-
def extract_frames(video_path: str, num_frames: int = 16) -> List[Tuple[int, float, Image.Image]]:
|
| 49 |
-
"""Uniformly sample num_frames frames from the video.
|
| 50 |
-
(frame_index, timestamp_seconds, PIL.Image).
|
| 51 |
"""
|
| 52 |
cap = cv2.VideoCapture(video_path)
|
| 53 |
if not cap.isOpened():
|
|
@@ -62,7 +65,7 @@ def extract_frames(video_path: str, num_frames: int = 16) -> List[Tuple[int, flo
|
|
| 62 |
n = min(num_frames, total)
|
| 63 |
indices = np.linspace(0, max(0, total - 1), num=n, dtype=int).tolist()
|
| 64 |
|
| 65 |
-
out: List[Tuple[int, float, Image.Image]] = []
|
| 66 |
for idx in indices:
|
| 67 |
cap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
|
| 68 |
ok, frame_bgr = cap.read()
|
|
@@ -71,40 +74,97 @@ def extract_frames(video_path: str, num_frames: int = 16) -> List[Tuple[int, flo
|
|
| 71 |
frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
|
| 72 |
pil = Image.fromarray(frame_rgb)
|
| 73 |
ts = (idx / fps) if fps > 0 else 0.0
|
| 74 |
-
out.append((int(idx), float(ts), pil))
|
| 75 |
|
| 76 |
cap.release()
|
| 77 |
logger.info(f"Extracted {len(out)}/{n} frames from video (total={total}, fps={fps:.2f})")
|
| 78 |
return out
|
| 79 |
|
| 80 |
|
| 81 |
-
MIN_FACE_FRAMES = 3
|
| 82 |
|
| 83 |
|
| 84 |
-
def
|
| 85 |
detector = get_model_loader().load_face_detector()
|
| 86 |
arr = np.array(pil)
|
| 87 |
res = detector.process(arr)
|
| 88 |
return bool(getattr(res, "multi_face_landmarks", None))
|
| 89 |
|
| 90 |
|
| 91 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
results: List[FrameAnalysis] = []
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
fake_prob = 0.0
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
results.append(
|
| 101 |
FrameAnalysis(
|
| 102 |
index=idx,
|
| 103 |
timestamp_s=ts,
|
| 104 |
-
label=
|
| 105 |
-
confidence=
|
| 106 |
suspicious_prob=fake_prob,
|
| 107 |
-
is_suspicious=(fake_prob >= 0.5) and
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 108 |
has_face=face,
|
| 109 |
scored=face,
|
| 110 |
)
|
|
@@ -112,18 +172,20 @@ def classify_frames(frames: List[Tuple[int, float, Image.Image]]) -> List[FrameA
|
|
| 112 |
return results
|
| 113 |
|
| 114 |
|
| 115 |
-
def aggregate(
|
| 116 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 117 |
return VideoAggregation(0, 0, 0, 0.0, 0.0, 0.0, True)
|
| 118 |
|
| 119 |
-
scored = [f for f in
|
| 120 |
num_face = len(scored)
|
| 121 |
insufficient = num_face < MIN_FACE_FRAMES
|
| 122 |
|
| 123 |
if insufficient:
|
| 124 |
-
mean_p = 0.0
|
| 125 |
-
max_p = 0.0
|
| 126 |
-
susp_ratio = 0.0
|
| 127 |
susp: List[FrameAnalysis] = []
|
| 128 |
else:
|
| 129 |
probs = [f.suspicious_prob for f in scored]
|
|
@@ -133,19 +195,28 @@ def aggregate(frames: List[FrameAnalysis]) -> VideoAggregation:
|
|
| 133 |
susp_ratio = len(susp) / len(scored)
|
| 134 |
|
| 135 |
return VideoAggregation(
|
| 136 |
-
num_frames_sampled=len(
|
| 137 |
num_face_frames=num_face,
|
| 138 |
-
num_suspicious_frames=len(susp),
|
| 139 |
mean_suspicious_prob=mean_p,
|
| 140 |
max_suspicious_prob=max_p,
|
| 141 |
suspicious_ratio=susp_ratio,
|
| 142 |
insufficient_faces=insufficient,
|
| 143 |
-
suspicious_timestamps=[round(f.timestamp_s, 2) for f in susp],
|
| 144 |
-
frames=
|
|
|
|
|
|
|
| 145 |
)
|
| 146 |
|
| 147 |
|
| 148 |
def analyze_video(video_path: str, num_frames: int = 16) -> VideoAggregation:
|
| 149 |
frames = extract_frames(video_path, num_frames=num_frames)
|
| 150 |
-
|
| 151 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from __future__ import annotations
|
| 2 |
|
| 3 |
from dataclasses import dataclass, field
|
| 4 |
+
from typing import List, Optional, Tuple
|
| 5 |
|
| 6 |
import cv2
|
| 7 |
import numpy as np
|
| 8 |
from loguru import logger
|
| 9 |
from PIL import Image
|
| 10 |
|
| 11 |
+
from config import settings
|
| 12 |
from models.model_loader import get_model_loader
|
| 13 |
+
from services.image_service import _classify_vit
|
| 14 |
|
| 15 |
|
| 16 |
@dataclass
|
|
|
|
| 19 |
timestamp_s: float
|
| 20 |
label: str
|
| 21 |
confidence: float
|
| 22 |
+
suspicious_prob: float
|
| 23 |
is_suspicious: bool
|
| 24 |
has_face: bool = False
|
| 25 |
+
scored: bool = False
|
| 26 |
|
| 27 |
|
| 28 |
@dataclass
|
|
|
|
| 36 |
insufficient_faces: bool
|
| 37 |
suspicious_timestamps: List[float] = field(default_factory=list)
|
| 38 |
frames: List[FrameAnalysis] = field(default_factory=list)
|
| 39 |
+
models_used: List[str] = field(default_factory=list)
|
| 40 |
+
face_detector_used: str = "mediapipe"
|
| 41 |
|
| 42 |
|
| 43 |
FAKE_TOKENS = ("fake", "deepfake", "manipulated", "ai", "generated", "synthetic")
|
|
|
|
| 48 |
return any(tok in l for tok in FAKE_TOKENS)
|
| 49 |
|
| 50 |
|
| 51 |
+
def extract_frames(video_path: str, num_frames: int = 16) -> List[Tuple[int, float, np.ndarray, Image.Image]]:
|
| 52 |
+
"""Uniformly sample num_frames frames from the video.
|
| 53 |
+
Returns list of (frame_index, timestamp_seconds, bgr_numpy, PIL.Image).
|
| 54 |
"""
|
| 55 |
cap = cv2.VideoCapture(video_path)
|
| 56 |
if not cap.isOpened():
|
|
|
|
| 65 |
n = min(num_frames, total)
|
| 66 |
indices = np.linspace(0, max(0, total - 1), num=n, dtype=int).tolist()
|
| 67 |
|
| 68 |
+
out: List[Tuple[int, float, np.ndarray, Image.Image]] = []
|
| 69 |
for idx in indices:
|
| 70 |
cap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
|
| 71 |
ok, frame_bgr = cap.read()
|
|
|
|
| 74 |
frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
|
| 75 |
pil = Image.fromarray(frame_rgb)
|
| 76 |
ts = (idx / fps) if fps > 0 else 0.0
|
| 77 |
+
out.append((int(idx), float(ts), frame_bgr, pil))
|
| 78 |
|
| 79 |
cap.release()
|
| 80 |
logger.info(f"Extracted {len(out)}/{n} frames from video (total={total}, fps={fps:.2f})")
|
| 81 |
return out
|
| 82 |
|
| 83 |
|
| 84 |
+
MIN_FACE_FRAMES = 3
|
| 85 |
|
| 86 |
|
| 87 |
+
def _has_face_mediapipe(pil: Image.Image) -> bool:
    """Report whether the loader's face detector finds any face landmarks in *pil*.

    The image is converted to a numpy array and passed to the detector's
    ``process`` method. The result counts as a face when its
    ``multi_face_landmarks`` attribute exists and is non-empty.
    """
    face_detector = get_model_loader().load_face_detector()
    detection = face_detector.process(np.array(pil))
    landmarks = getattr(detection, "multi_face_landmarks", None)
    return bool(landmarks)
|
| 92 |
|
| 93 |
|
| 94 |
+
def _analyze_with_efficientnet(
    frames: List[Tuple[int, float, np.ndarray, Image.Image]],
) -> Tuple[List[FrameAnalysis], str, List[str]]:
    """Primary video path: score each frame with EfficientNet on BlazeFace face crops.

    For every frame the detector's face extractor is run on the RGB image. When
    it returns a face, the first one is scored and the sigmoid of the network
    output is passed through the detector's calibrator, consistent with the
    detector's own ``detect_image`` / ``detect_video_frames``.

    When the extractor finds nothing, MediaPipe is consulted so that
    ``has_face`` still reflects a visible face. Such a frame has no model
    score, so it is labelled "unknown" and left ``scored=False``; its
    placeholder probability of 0.0 must not enter the aggregate.

    If the EfficientNet detector is unavailable the ViT pipeline is used instead.

    Args:
        frames: ``(frame_index, timestamp_seconds, bgr_array, pil_image)`` tuples.

    Returns:
        ``(frame_results, face_detector_used, models_used)`` where
        ``face_detector_used`` is "blazeface", "blazeface+mediapipe_fallback",
        or "mediapipe" on the ViT fallback.
    """
    loader = get_model_loader()
    eff = loader.load_efficientnet()

    if eff is None:
        logger.warning("EfficientNet unavailable — falling back to ViT video pipeline")
        return _analyze_with_vit(frames), "mediapipe", [settings.IMAGE_MODEL_ID]

    # Imported once per call (previously re-executed for every frame) and only
    # after the fallback check, so the ViT path does not depend on them.
    import torch
    from scipy.special import expit

    results: List[FrameAnalysis] = []
    face_detector_used = "blazeface"
    models_used = [f"{settings.EFFICIENTNET_MODEL}_{settings.EFFICIENTNET_TRAIN_DB}"]

    for idx, ts, frame_bgr, pil in frames:
        # Pass RGB to EfficientNet (process_image expects RGB array).
        frame_rgb = frame_bgr[..., ::-1].copy()
        frame_data = eff.face_extractor.process_image(img=frame_rgb)
        faces: list = frame_data.get("faces", [])

        # A frame is "scored" only when EfficientNet actually ran on a face crop.
        was_scored = bool(faces)
        has_face = was_scored
        if not has_face:
            # Fallback: check MediaPipe so we don't silently miss faces.
            has_face = _has_face_mediapipe(pil)
            if has_face:
                face_detector_used = "blazeface+mediapipe_fallback"

        fake_prob = 0.0
        if was_scored:
            # Run EfficientNet on the first face returned by the extractor.
            face_t = eff._face_tensor(faces[0])
            with torch.inference_mode():
                logit = eff.net(face_t.unsqueeze(0).to(eff.device))
            raw_prob = float(expit(logit.cpu().numpy().item()))
            fake_prob = float(eff._calibrate(raw_prob))
            # Same >= 0.5 threshold as is_suspicious below.
            label = "Fake" if fake_prob >= 0.5 else "Real"
        elif has_face:
            # MediaPipe saw a face but there is no crop to score.
            label = "unknown"
        else:
            label = "no_face"

        results.append(
            FrameAnalysis(
                index=idx,
                timestamp_s=ts,
                label=label,
                confidence=fake_prob,
                suspicious_prob=fake_prob,
                is_suspicious=(fake_prob >= 0.5) and has_face,
                has_face=has_face,
                scored=was_scored,
            )
        )

    return results, face_detector_used, models_used
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
def _analyze_with_vit(
|
| 153 |
+
frames: List[Tuple[int, float, np.ndarray, Image.Image]],
|
| 154 |
+
) -> List[FrameAnalysis]:
|
| 155 |
+
"""Fallback: original ViT-per-frame pipeline (MediaPipe face gate)."""
|
| 156 |
+
results: List[FrameAnalysis] = []
|
| 157 |
+
for idx, ts, _bgr, pil in frames:
|
| 158 |
+
face = _has_face_mediapipe(pil)
|
| 159 |
+
vit_fake_prob, vit_label, _ = _classify_vit(pil)
|
| 160 |
+
results.append(
|
| 161 |
+
FrameAnalysis(
|
| 162 |
+
index=idx,
|
| 163 |
+
timestamp_s=ts,
|
| 164 |
+
label=vit_label,
|
| 165 |
+
confidence=vit_fake_prob,
|
| 166 |
+
suspicious_prob=vit_fake_prob,
|
| 167 |
+
is_suspicious=(vit_fake_prob >= 0.5) and face,
|
| 168 |
has_face=face,
|
| 169 |
scored=face,
|
| 170 |
)
|
|
|
|
| 172 |
return results
|
| 173 |
|
| 174 |
|
| 175 |
+
def aggregate(
|
| 176 |
+
frame_results: List[FrameAnalysis],
|
| 177 |
+
models_used: Optional[List[str]] = None,
|
| 178 |
+
face_detector_used: str = "mediapipe",
|
| 179 |
+
) -> VideoAggregation:
|
| 180 |
+
if not frame_results:
|
| 181 |
return VideoAggregation(0, 0, 0, 0.0, 0.0, 0.0, True)
|
| 182 |
|
| 183 |
+
scored = [f for f in frame_results if f.scored]
|
| 184 |
num_face = len(scored)
|
| 185 |
insufficient = num_face < MIN_FACE_FRAMES
|
| 186 |
|
| 187 |
if insufficient:
|
| 188 |
+
mean_p, max_p, susp_ratio = 0.0, 0.0, 0.0
|
|
|
|
|
|
|
| 189 |
susp: List[FrameAnalysis] = []
|
| 190 |
else:
|
| 191 |
probs = [f.suspicious_prob for f in scored]
|
|
|
|
| 195 |
susp_ratio = len(susp) / len(scored)
|
| 196 |
|
| 197 |
return VideoAggregation(
|
| 198 |
+
num_frames_sampled=len(frame_results),
|
| 199 |
num_face_frames=num_face,
|
| 200 |
+
num_suspicious_frames=len(susp) if not insufficient else 0,
|
| 201 |
mean_suspicious_prob=mean_p,
|
| 202 |
max_suspicious_prob=max_p,
|
| 203 |
suspicious_ratio=susp_ratio,
|
| 204 |
insufficient_faces=insufficient,
|
| 205 |
+
suspicious_timestamps=[round(f.timestamp_s, 2) for f in (susp if not insufficient else [])],
|
| 206 |
+
frames=frame_results,
|
| 207 |
+
models_used=models_used or [settings.IMAGE_MODEL_ID],
|
| 208 |
+
face_detector_used=face_detector_used,
|
| 209 |
)
|
| 210 |
|
| 211 |
|
| 212 |
def analyze_video(video_path: str, num_frames: int = 16) -> VideoAggregation:
    """Sample frames from a video, classify them and aggregate the results.

    With ``settings.ENSEMBLE_MODE`` enabled the EfficientNet pipeline is used
    (which itself falls back to ViT when the detector is unavailable);
    otherwise the ViT pipeline runs directly.

    Args:
        video_path: Path of the video file to analyse.
        num_frames: Number of frames requested from ``extract_frames``.

    Returns:
        The ``VideoAggregation`` built by ``aggregate`` from the per-frame results.
    """
    frames = extract_frames(video_path, num_frames=num_frames)

    if not settings.ENSEMBLE_MODE:
        return aggregate(
            _analyze_with_vit(frames),
            models_used=[settings.IMAGE_MODEL_ID],
            face_detector_used="mediapipe",
        )

    frame_results, face_detector_used, models_used = _analyze_with_efficientnet(frames)
    return aggregate(frame_results, models_used=models_used, face_detector_used=face_detector_used)
|
v1/__pycache__/__init__.cpython-311.pyc
DELETED
|
Binary file (165 Bytes)
|
|
|
v1/__pycache__/analyze.cpython-311.pyc
DELETED
|
Binary file (21.6 kB)
|
|
|
v1/__pycache__/auth.cpython-311.pyc
DELETED
|
Binary file (3.82 kB)
|
|
|
v1/__pycache__/health.cpython-311.pyc
DELETED
|
Binary file (556 Bytes)
|
|
|
v1/__pycache__/history.cpython-311.pyc
DELETED
|
Binary file (5.19 kB)
|
|
|
v1/__pycache__/report.cpython-311.pyc
DELETED
|
Binary file (4.29 kB)
|
|
|
v1/analyze.py
CHANGED
|
@@ -55,6 +55,7 @@ from services.text_service import (
|
|
| 55 |
score_sensationalism,
|
| 56 |
)
|
| 57 |
from services.video_service import analyze_video
|
|
|
|
| 58 |
from utils.file_handler import read_upload_bytes, save_upload_to_tempfile
|
| 59 |
from utils.scoring import compute_authenticity_score, get_verdict_label
|
| 60 |
|
|
@@ -89,7 +90,10 @@ async def analyze_image(
|
|
| 89 |
heatmap_status = "success"
|
| 90 |
heatmap = ""
|
| 91 |
try:
|
| 92 |
-
|
|
|
|
|
|
|
|
|
|
| 93 |
stages.append("heatmap_generation")
|
| 94 |
except Exception as e: # noqa: BLE001
|
| 95 |
logger.warning(f"Heatmap generation failed, continuing: {e}")
|
|
@@ -155,6 +159,7 @@ async def analyze_image(
|
|
| 155 |
stages_completed=stages,
|
| 156 |
total_duration_ms=duration_ms,
|
| 157 |
model_used=settings.IMAGE_MODEL_ID,
|
|
|
|
| 158 |
),
|
| 159 |
)
|
| 160 |
|
|
@@ -218,11 +223,12 @@ async def analyze_video_endpoint(
|
|
| 218 |
stages.append("frame_extraction")
|
| 219 |
stages.append("frame_classification")
|
| 220 |
stages.append("aggregation")
|
| 221 |
-
|
| 222 |
try:
|
| 223 |
os.unlink(path)
|
| 224 |
except OSError:
|
| 225 |
pass
|
|
|
|
| 226 |
|
| 227 |
if agg.insufficient_faces:
|
| 228 |
score = 50
|
|
@@ -271,6 +277,7 @@ async def analyze_video_endpoint(
|
|
| 271 |
stages_completed=stages,
|
| 272 |
total_duration_ms=duration_ms,
|
| 273 |
model_used=settings.IMAGE_MODEL_ID,
|
|
|
|
| 274 |
),
|
| 275 |
)
|
| 276 |
|
|
@@ -290,6 +297,23 @@ async def analyze_video_endpoint(
|
|
| 290 |
f"frames={agg.num_frames_sampled} susp={agg.num_suspicious_frames}"
|
| 291 |
)
|
| 292 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 293 |
# Phase 12: LLM explainability card
|
| 294 |
try:
|
| 295 |
response.llm_summary = generate_llm_summary(
|
|
|
|
| 55 |
score_sensationalism,
|
| 56 |
)
|
| 57 |
from services.video_service import analyze_video
|
| 58 |
+
from services.metadata_writer import write_verdict_metadata
|
| 59 |
from utils.file_handler import read_upload_bytes, save_upload_to_tempfile
|
| 60 |
from utils.scoring import compute_authenticity_score, get_verdict_label
|
| 61 |
|
|
|
|
| 90 |
heatmap_status = "success"
|
| 91 |
heatmap = ""
|
| 92 |
try:
|
| 93 |
+
model_family = "efficientnet" if settings.ENSEMBLE_MODE else "vit"
|
| 94 |
+
heatmap, heatmap_source = generate_heatmap_base64(pil, model_family=model_family)
|
| 95 |
+
if not heatmap:
|
| 96 |
+
heatmap_status = heatmap_source # "none" or "fallback"
|
| 97 |
stages.append("heatmap_generation")
|
| 98 |
except Exception as e: # noqa: BLE001
|
| 99 |
logger.warning(f"Heatmap generation failed, continuing: {e}")
|
|
|
|
| 159 |
stages_completed=stages,
|
| 160 |
total_duration_ms=duration_ms,
|
| 161 |
model_used=settings.IMAGE_MODEL_ID,
|
| 162 |
+
models_used=clf.models_used,
|
| 163 |
),
|
| 164 |
)
|
| 165 |
|
|
|
|
| 223 |
stages.append("frame_extraction")
|
| 224 |
stages.append("frame_classification")
|
| 225 |
stages.append("aggregation")
|
| 226 |
+
except Exception:
|
| 227 |
try:
|
| 228 |
os.unlink(path)
|
| 229 |
except OSError:
|
| 230 |
pass
|
| 231 |
+
raise
|
| 232 |
|
| 233 |
if agg.insufficient_faces:
|
| 234 |
score = 50
|
|
|
|
| 277 |
stages_completed=stages,
|
| 278 |
total_duration_ms=duration_ms,
|
| 279 |
model_used=settings.IMAGE_MODEL_ID,
|
| 280 |
+
models_used=agg.models_used,
|
| 281 |
),
|
| 282 |
)
|
| 283 |
|
|
|
|
| 297 |
f"frames={agg.num_frames_sampled} susp={agg.num_suspicious_frames}"
|
| 298 |
)
|
| 299 |
|
| 300 |
+
# Write verdict into video metadata (ExifTool, optional — gated by EXIFTOOL_PATH).
|
| 301 |
+
try:
|
| 302 |
+
write_verdict_metadata(
|
| 303 |
+
file_path=path,
|
| 304 |
+
verdict=label,
|
| 305 |
+
authenticity_score=score,
|
| 306 |
+
models_used=agg.models_used,
|
| 307 |
+
analysis_id=str(record.id),
|
| 308 |
+
)
|
| 309 |
+
except Exception as e: # noqa: BLE001
|
| 310 |
+
logger.warning(f"Metadata write failed: {e}")
|
| 311 |
+
finally:
|
| 312 |
+
try:
|
| 313 |
+
os.unlink(path)
|
| 314 |
+
except OSError:
|
| 315 |
+
pass
|
| 316 |
+
|
| 317 |
# Phase 12: LLM explainability card
|
| 318 |
try:
|
| 319 |
response.llm_summary = generate_llm_summary(
|