Spyderzz committed on
Commit
3909c31
·
1 Parent(s): 4ef8b6a

feat: merge DeepShield1 EfficientNet ensemble into production deployment

Browse files
Files changed (45) hide show
  1. config.py +7 -0
  2. models/heatmap_generator.py +75 -6
  3. models/icpr2020dfdc/.gitignore +5 -0
  4. models/icpr2020dfdc/.travis.yml +15 -0
  5. models/icpr2020dfdc/LICENSE +674 -0
  6. models/icpr2020dfdc/README.md +120 -0
  7. models/icpr2020dfdc/architectures/__init__.py +0 -0
  8. models/icpr2020dfdc/architectures/externals/__init__.py +1 -0
  9. models/icpr2020dfdc/architectures/externals/xception.py +236 -0
  10. models/icpr2020dfdc/architectures/fornet.py +245 -0
  11. models/icpr2020dfdc/architectures/tripletnet.py +44 -0
  12. models/icpr2020dfdc/architectures/weights.py +24 -0
  13. models/icpr2020dfdc/blazeface/__init__.py +3 -0
  14. models/icpr2020dfdc/blazeface/anchors.npy +3 -0
  15. models/icpr2020dfdc/blazeface/blazeface.pth +3 -0
  16. models/icpr2020dfdc/blazeface/blazeface.py +417 -0
  17. models/icpr2020dfdc/blazeface/face_extract.py +470 -0
  18. models/icpr2020dfdc/blazeface/read_video.py +213 -0
  19. models/icpr2020dfdc/environment.yml +25 -0
  20. models/icpr2020dfdc/extract_faces.py +346 -0
  21. models/icpr2020dfdc/index_celebdf.py +85 -0
  22. models/icpr2020dfdc/index_dfdc.py +94 -0
  23. models/icpr2020dfdc/index_ffpp.py +92 -0
  24. models/icpr2020dfdc/isplutils/__init__.py +0 -0
  25. models/icpr2020dfdc/isplutils/data.py +263 -0
  26. models/icpr2020dfdc/isplutils/data_siamese.py +78 -0
  27. models/icpr2020dfdc/isplutils/split.py +135 -0
  28. models/icpr2020dfdc/isplutils/utils.py +247 -0
  29. models/icpr2020dfdc/test_model.py +270 -0
  30. models/icpr2020dfdc/train_binclass.py +460 -0
  31. models/icpr2020dfdc/train_triplet.py +459 -0
  32. models/model_loader.py +18 -0
  33. requirements.txt +7 -0
  34. schemas/common.py +1 -0
  35. services/efficientnet_service.py +209 -0
  36. services/image_service.py +81 -8
  37. services/metadata_writer.py +73 -0
  38. services/video_service.py +104 -33
  39. v1/__pycache__/__init__.cpython-311.pyc +0 -0
  40. v1/__pycache__/analyze.cpython-311.pyc +0 -0
  41. v1/__pycache__/auth.cpython-311.pyc +0 -0
  42. v1/__pycache__/health.cpython-311.pyc +0 -0
  43. v1/__pycache__/history.cpython-311.pyc +0 -0
  44. v1/__pycache__/report.cpython-311.pyc +0 -0
  45. v1/analyze.py +26 -2
config.py CHANGED
@@ -42,6 +42,13 @@ class Settings(BaseSettings):
42
  LLM_API_KEY: str = ""
43
  LLM_MODEL: str = "gemini-2.5-pro" # or "gpt-4o"
44
 
 
 
 
 
 
 
 
45
  # Auth
46
  JWT_SECRET_KEY: str = "change-me-in-production"
47
  JWT_ALGORITHM: str = "HS256"
 
42
  LLM_API_KEY: str = ""
43
  LLM_MODEL: str = "gemini-2.5-pro" # or "gpt-4o"
44
 
45
+ # EfficientNet (ICPR2020 / DeepShield1 merge)
46
+ EFFICIENTNET_MODEL: str = "EfficientNetAutoAttB4"
47
+ EFFICIENTNET_TRAIN_DB: str = "DFDC"
48
+ ENSEMBLE_MODE: bool = True # run both ViT + EfficientNet and average scores
49
+ VIDEO_SAMPLE_FRAMES: int = 16 # frames to sample per video for inference
50
+ EXIFTOOL_PATH: str = "" # full path to ExifTool binary; empty = metadata write disabled
51
+
52
  # Auth
53
  JWT_SECRET_KEY: str = "change-me-in-production"
54
  JWT_ALGORITHM: str = "HS256"
models/heatmap_generator.py CHANGED
@@ -2,7 +2,7 @@ from __future__ import annotations
2
 
3
  import base64
4
  import io
5
- from typing import Optional
6
 
7
  import cv2
8
  import numpy as np
@@ -107,15 +107,84 @@ def _compute_gradcam_pp(
107
  return grayscale_cam, rgb_float
108
 
109
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
  def generate_heatmap_base64(
111
  pil_img: Image.Image,
112
  target_class_idx: Optional[int] = None,
113
- ) -> str:
114
- """Produce a base64 data-URL PNG of the Grad-CAM++ overlay for the given image."""
115
- grayscale_cam, rgb_float = _compute_gradcam_pp(pil_img, target_class_idx)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
  overlay = show_cam_on_image(rgb_float, grayscale_cam, use_rgb=True)
117
- logger.info(f"Heatmap generated ({overlay.shape[0]}x{overlay.shape[1]})")
118
- return _encode_overlay_to_base64(overlay)
119
 
120
 
121
  def generate_boxes_base64(
 
2
 
3
  import base64
4
  import io
5
+ from typing import Literal, Optional
6
 
7
  import cv2
8
  import numpy as np
 
107
  return grayscale_cam, rgb_float
108
 
109
 
110
def _compute_gradcam_pp_efficientnet(
    pil_img: Image.Image,
) -> tuple[np.ndarray, np.ndarray, Literal["attention", "gradcam++", "fallback"]]:
    """Compute a Grad-CAM++ saliency map for the EfficientNet detector.

    The image is converted to RGB, passed through the model's face extractor,
    and Grad-CAM++ is run on the first detected face using the last block of
    ``net.efficientnet._blocks`` as the target layer.

    Args:
        pil_img: Input image; converted to RGB when it is in another mode.

    Returns:
        ``(grayscale_cam, rgb_float, heatmap_source)`` where ``rgb_float`` is
        the face crop scaled by 1/255 and resized to 224x224 for overlaying,
        and ``heatmap_source`` is ``"gradcam++"`` when Grad-CAM++ succeeded or
        ``"fallback"`` when it failed and a constant 0.5 map is returned
        instead. ``"attention"`` stays in the annotation for compatibility;
        this function never produces it.

    Raises:
        RuntimeError: If the model loader returns no EfficientNet.
        ValueError: With message ``"no_face"`` when no face is found.
    """
    loader = get_model_loader()
    eff = loader.load_efficientnet()
    if eff is None:
        raise RuntimeError("EfficientNet not loaded")

    if pil_img.mode != "RGB":
        pil_img = pil_img.convert("RGB")
    img_np = np.array(pil_img)

    # Face crop comes from the model's own extractor; only the first face is used.
    frame_data = eff.face_extractor.process_image(img=img_np)
    faces: list = frame_data.get("faces", [])
    if not faces:
        raise ValueError("no_face")

    # NOTE(review): assumes faces[0] is an HxWx3 array with 0-255 values — confirm.
    face_np = faces[0]
    face_t = eff._face_tensor(face_np).unsqueeze(0).to(eff.device)

    # Float [0, 1] copy of the crop, sized to match the 224x224 overlay.
    rgb_float = face_np.astype(np.float32) / 255.0
    if rgb_float.shape[:2] != (224, 224):
        rgb_float = cv2.resize(rgb_float, (224, 224)).astype(np.float32)

    # Parameters whose requires_grad flag we flip on, so it can be restored.
    frozen: list = []
    try:
        net = eff.net
        target_layers = [net.efficientnet._blocks[-1]]

        face_t.requires_grad_(True)
        frozen = [p for p in net.parameters() if not p.requires_grad]
        for p in frozen:
            p.requires_grad_(True)

        with GradCAMPlusPlus(model=net, target_layers=target_layers) as cam:
            grayscale_cam = cam(input_tensor=face_t, targets=None)[0]

        return grayscale_cam, rgb_float, "gradcam++"
    except Exception as e:
        # Deliberate best-effort: a heatmap failure must not break detection.
        logger.warning(f"EfficientNet Grad-CAM++ failed ({e}), using uniform fallback")
        grayscale_cam = np.ones((224, 224), dtype=np.float32) * 0.5
        # Label the constant map honestly instead of as a real Grad-CAM++ result.
        return grayscale_cam, rgb_float, "fallback"
    finally:
        # Put the shared model back into its previous gradient state.
        for p in frozen:
            p.requires_grad_(False)
160
+
161
+
162
def generate_heatmap_base64(
    pil_img: Image.Image,
    target_class_idx: Optional[int] = None,
    model_family: Literal["vit", "efficientnet"] = "vit",
) -> tuple[str, str]:
    """Render the class-activation overlay for an image as a base64 PNG.

    Args:
        pil_img: Image to explain.
        target_class_idx: Class index forwarded to the ViT Grad-CAM++ path;
            not used on the EfficientNet path.
        model_family: Which model's activation map to compute.

    Returns:
        ``(base64_png, heatmap_source)``. For the EfficientNet family an empty
        string is returned with source ``"none"`` when a ``ValueError`` is
        raised (logged as no face detected) and with source ``"fallback"`` on
        any other exception. Otherwise the source is whatever the
        family-specific helper reports (``"gradcam++"`` for ViT).
    """
    if model_family != "efficientnet":
        # ViT path: exceptions propagate to the caller unchanged.
        cam_map, base_rgb = _compute_gradcam_pp(pil_img, target_class_idx)
        source = "gradcam++"
    else:
        try:
            cam_map, base_rgb, source = _compute_gradcam_pp_efficientnet(pil_img)
        except ValueError:
            logger.info("EfficientNet heatmap skipped — no face detected")
            return "", "none"
        except Exception as e:
            logger.warning(f"EfficientNet heatmap failed: {e}")
            return "", "fallback"

    overlay = show_cam_on_image(base_rgb, cam_map, use_rgb=True)
    logger.info(f"Heatmap generated ({overlay.shape[0]}x{overlay.shape[1]}) source={source}")
    encoded = _encode_overlay_to_base64(overlay)
    return encoded, source
188
 
189
 
190
  def generate_boxes_base64(
models/icpr2020dfdc/.gitignore ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+
2
+ .idea/
3
+ .DS_Store
4
+ .ipynb_checkpoints/
5
+ __pycache__/
models/icpr2020dfdc/.travis.yml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ language: python
2
+ python:
3
+ - "3.6.9"
4
+ install:
5
+ - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O $HOME/miniconda.sh
6
+ - bash $HOME/miniconda.sh -bfp $HOME/miniconda3
7
+ - export PATH=$HOME/miniconda3/bin:$PATH
8
+ - conda env create -f environment.yml
9
+ before_script:
10
+ - source activate icpr2020
11
+ - cd test
12
+ script:
13
+ - python -m unittest test_dfdc.TestDFDC
14
+ - python -m unittest test_ffpp.TestFFPP
15
+
models/icpr2020dfdc/LICENSE ADDED
@@ -0,0 +1,674 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ GNU GENERAL PUBLIC LICENSE
2
+ Version 3, 29 June 2007
3
+
4
+ Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
5
+ Everyone is permitted to copy and distribute verbatim copies
6
+ of this license document, but changing it is not allowed.
7
+
8
+ Preamble
9
+
10
+ The GNU General Public License is a free, copyleft license for
11
+ software and other kinds of works.
12
+
13
+ The licenses for most software and other practical works are designed
14
+ to take away your freedom to share and change the works. By contrast,
15
+ the GNU General Public License is intended to guarantee your freedom to
16
+ share and change all versions of a program--to make sure it remains free
17
+ software for all its users. We, the Free Software Foundation, use the
18
+ GNU General Public License for most of our software; it applies also to
19
+ any other work released this way by its authors. You can apply it to
20
+ your programs, too.
21
+
22
+ When we speak of free software, we are referring to freedom, not
23
+ price. Our General Public Licenses are designed to make sure that you
24
+ have the freedom to distribute copies of free software (and charge for
25
+ them if you wish), that you receive source code or can get it if you
26
+ want it, that you can change the software or use pieces of it in new
27
+ free programs, and that you know you can do these things.
28
+
29
+ To protect your rights, we need to prevent others from denying you
30
+ these rights or asking you to surrender the rights. Therefore, you have
31
+ certain responsibilities if you distribute copies of the software, or if
32
+ you modify it: responsibilities to respect the freedom of others.
33
+
34
+ For example, if you distribute copies of such a program, whether
35
+ gratis or for a fee, you must pass on to the recipients the same
36
+ freedoms that you received. You must make sure that they, too, receive
37
+ or can get the source code. And you must show them these terms so they
38
+ know their rights.
39
+
40
+ Developers that use the GNU GPL protect your rights with two steps:
41
+ (1) assert copyright on the software, and (2) offer you this License
42
+ giving you legal permission to copy, distribute and/or modify it.
43
+
44
+ For the developers' and authors' protection, the GPL clearly explains
45
+ that there is no warranty for this free software. For both users' and
46
+ authors' sake, the GPL requires that modified versions be marked as
47
+ changed, so that their problems will not be attributed erroneously to
48
+ authors of previous versions.
49
+
50
+ Some devices are designed to deny users access to install or run
51
+ modified versions of the software inside them, although the manufacturer
52
+ can do so. This is fundamentally incompatible with the aim of
53
+ protecting users' freedom to change the software. The systematic
54
+ pattern of such abuse occurs in the area of products for individuals to
55
+ use, which is precisely where it is most unacceptable. Therefore, we
56
+ have designed this version of the GPL to prohibit the practice for those
57
+ products. If such problems arise substantially in other domains, we
58
+ stand ready to extend this provision to those domains in future versions
59
+ of the GPL, as needed to protect the freedom of users.
60
+
61
+ Finally, every program is threatened constantly by software patents.
62
+ States should not allow patents to restrict development and use of
63
+ software on general-purpose computers, but in those that do, we wish to
64
+ avoid the special danger that patents applied to a free program could
65
+ make it effectively proprietary. To prevent this, the GPL assures that
66
+ patents cannot be used to render the program non-free.
67
+
68
+ The precise terms and conditions for copying, distribution and
69
+ modification follow.
70
+
71
+ TERMS AND CONDITIONS
72
+
73
+ 0. Definitions.
74
+
75
+ "This License" refers to version 3 of the GNU General Public License.
76
+
77
+ "Copyright" also means copyright-like laws that apply to other kinds of
78
+ works, such as semiconductor masks.
79
+
80
+ "The Program" refers to any copyrightable work licensed under this
81
+ License. Each licensee is addressed as "you". "Licensees" and
82
+ "recipients" may be individuals or organizations.
83
+
84
+ To "modify" a work means to copy from or adapt all or part of the work
85
+ in a fashion requiring copyright permission, other than the making of an
86
+ exact copy. The resulting work is called a "modified version" of the
87
+ earlier work or a work "based on" the earlier work.
88
+
89
+ A "covered work" means either the unmodified Program or a work based
90
+ on the Program.
91
+
92
+ To "propagate" a work means to do anything with it that, without
93
+ permission, would make you directly or secondarily liable for
94
+ infringement under applicable copyright law, except executing it on a
95
+ computer or modifying a private copy. Propagation includes copying,
96
+ distribution (with or without modification), making available to the
97
+ public, and in some countries other activities as well.
98
+
99
+ To "convey" a work means any kind of propagation that enables other
100
+ parties to make or receive copies. Mere interaction with a user through
101
+ a computer network, with no transfer of a copy, is not conveying.
102
+
103
+ An interactive user interface displays "Appropriate Legal Notices"
104
+ to the extent that it includes a convenient and prominently visible
105
+ feature that (1) displays an appropriate copyright notice, and (2)
106
+ tells the user that there is no warranty for the work (except to the
107
+ extent that warranties are provided), that licensees may convey the
108
+ work under this License, and how to view a copy of this License. If
109
+ the interface presents a list of user commands or options, such as a
110
+ menu, a prominent item in the list meets this criterion.
111
+
112
+ 1. Source Code.
113
+
114
+ The "source code" for a work means the preferred form of the work
115
+ for making modifications to it. "Object code" means any non-source
116
+ form of a work.
117
+
118
+ A "Standard Interface" means an interface that either is an official
119
+ standard defined by a recognized standards body, or, in the case of
120
+ interfaces specified for a particular programming language, one that
121
+ is widely used among developers working in that language.
122
+
123
+ The "System Libraries" of an executable work include anything, other
124
+ than the work as a whole, that (a) is included in the normal form of
125
+ packaging a Major Component, but which is not part of that Major
126
+ Component, and (b) serves only to enable use of the work with that
127
+ Major Component, or to implement a Standard Interface for which an
128
+ implementation is available to the public in source code form. A
129
+ "Major Component", in this context, means a major essential component
130
+ (kernel, window system, and so on) of the specific operating system
131
+ (if any) on which the executable work runs, or a compiler used to
132
+ produce the work, or an object code interpreter used to run it.
133
+
134
+ The "Corresponding Source" for a work in object code form means all
135
+ the source code needed to generate, install, and (for an executable
136
+ work) run the object code and to modify the work, including scripts to
137
+ control those activities. However, it does not include the work's
138
+ System Libraries, or general-purpose tools or generally available free
139
+ programs which are used unmodified in performing those activities but
140
+ which are not part of the work. For example, Corresponding Source
141
+ includes interface definition files associated with source files for
142
+ the work, and the source code for shared libraries and dynamically
143
+ linked subprograms that the work is specifically designed to require,
144
+ such as by intimate data communication or control flow between those
145
+ subprograms and other parts of the work.
146
+
147
+ The Corresponding Source need not include anything that users
148
+ can regenerate automatically from other parts of the Corresponding
149
+ Source.
150
+
151
+ The Corresponding Source for a work in source code form is that
152
+ same work.
153
+
154
+ 2. Basic Permissions.
155
+
156
+ All rights granted under this License are granted for the term of
157
+ copyright on the Program, and are irrevocable provided the stated
158
+ conditions are met. This License explicitly affirms your unlimited
159
+ permission to run the unmodified Program. The output from running a
160
+ covered work is covered by this License only if the output, given its
161
+ content, constitutes a covered work. This License acknowledges your
162
+ rights of fair use or other equivalent, as provided by copyright law.
163
+
164
+ You may make, run and propagate covered works that you do not
165
+ convey, without conditions so long as your license otherwise remains
166
+ in force. You may convey covered works to others for the sole purpose
167
+ of having them make modifications exclusively for you, or provide you
168
+ with facilities for running those works, provided that you comply with
169
+ the terms of this License in conveying all material for which you do
170
+ not control copyright. Those thus making or running the covered works
171
+ for you must do so exclusively on your behalf, under your direction
172
+ and control, on terms that prohibit them from making any copies of
173
+ your copyrighted material outside their relationship with you.
174
+
175
+ Conveying under any other circumstances is permitted solely under
176
+ the conditions stated below. Sublicensing is not allowed; section 10
177
+ makes it unnecessary.
178
+
179
+ 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
180
+
181
+ No covered work shall be deemed part of an effective technological
182
+ measure under any applicable law fulfilling obligations under article
183
+ 11 of the WIPO copyright treaty adopted on 20 December 1996, or
184
+ similar laws prohibiting or restricting circumvention of such
185
+ measures.
186
+
187
+ When you convey a covered work, you waive any legal power to forbid
188
+ circumvention of technological measures to the extent such circumvention
189
+ is effected by exercising rights under this License with respect to
190
+ the covered work, and you disclaim any intention to limit operation or
191
+ modification of the work as a means of enforcing, against the work's
192
+ users, your or third parties' legal rights to forbid circumvention of
193
+ technological measures.
194
+
195
+ 4. Conveying Verbatim Copies.
196
+
197
+ You may convey verbatim copies of the Program's source code as you
198
+ receive it, in any medium, provided that you conspicuously and
199
+ appropriately publish on each copy an appropriate copyright notice;
200
+ keep intact all notices stating that this License and any
201
+ non-permissive terms added in accord with section 7 apply to the code;
202
+ keep intact all notices of the absence of any warranty; and give all
203
+ recipients a copy of this License along with the Program.
204
+
205
+ You may charge any price or no price for each copy that you convey,
206
+ and you may offer support or warranty protection for a fee.
207
+
208
+ 5. Conveying Modified Source Versions.
209
+
210
+ You may convey a work based on the Program, or the modifications to
211
+ produce it from the Program, in the form of source code under the
212
+ terms of section 4, provided that you also meet all of these conditions:
213
+
214
+ a) The work must carry prominent notices stating that you modified
215
+ it, and giving a relevant date.
216
+
217
+ b) The work must carry prominent notices stating that it is
218
+ released under this License and any conditions added under section
219
+ 7. This requirement modifies the requirement in section 4 to
220
+ "keep intact all notices".
221
+
222
+ c) You must license the entire work, as a whole, under this
223
+ License to anyone who comes into possession of a copy. This
224
+ License will therefore apply, along with any applicable section 7
225
+ additional terms, to the whole of the work, and all its parts,
226
+ regardless of how they are packaged. This License gives no
227
+ permission to license the work in any other way, but it does not
228
+ invalidate such permission if you have separately received it.
229
+
230
+ d) If the work has interactive user interfaces, each must display
231
+ Appropriate Legal Notices; however, if the Program has interactive
232
+ interfaces that do not display Appropriate Legal Notices, your
233
+ work need not make them do so.
234
+
235
+ A compilation of a covered work with other separate and independent
236
+ works, which are not by their nature extensions of the covered work,
237
+ and which are not combined with it such as to form a larger program,
238
+ in or on a volume of a storage or distribution medium, is called an
239
+ "aggregate" if the compilation and its resulting copyright are not
240
+ used to limit the access or legal rights of the compilation's users
241
+ beyond what the individual works permit. Inclusion of a covered work
242
+ in an aggregate does not cause this License to apply to the other
243
+ parts of the aggregate.
244
+
245
+ 6. Conveying Non-Source Forms.
246
+
247
+ You may convey a covered work in object code form under the terms
248
+ of sections 4 and 5, provided that you also convey the
249
+ machine-readable Corresponding Source under the terms of this License,
250
+ in one of these ways:
251
+
252
+ a) Convey the object code in, or embodied in, a physical product
253
+ (including a physical distribution medium), accompanied by the
254
+ Corresponding Source fixed on a durable physical medium
255
+ customarily used for software interchange.
256
+
257
+ b) Convey the object code in, or embodied in, a physical product
258
+ (including a physical distribution medium), accompanied by a
259
+ written offer, valid for at least three years and valid for as
260
+ long as you offer spare parts or customer support for that product
261
+ model, to give anyone who possesses the object code either (1) a
262
+ copy of the Corresponding Source for all the software in the
263
+ product that is covered by this License, on a durable physical
264
+ medium customarily used for software interchange, for a price no
265
+ more than your reasonable cost of physically performing this
266
+ conveying of source, or (2) access to copy the
267
+ Corresponding Source from a network server at no charge.
268
+
269
+ c) Convey individual copies of the object code with a copy of the
270
+ written offer to provide the Corresponding Source. This
271
+ alternative is allowed only occasionally and noncommercially, and
272
+ only if you received the object code with such an offer, in accord
273
+ with subsection 6b.
274
+
275
+ d) Convey the object code by offering access from a designated
276
+ place (gratis or for a charge), and offer equivalent access to the
277
+ Corresponding Source in the same way through the same place at no
278
+ further charge. You need not require recipients to copy the
279
+ Corresponding Source along with the object code. If the place to
280
+ copy the object code is a network server, the Corresponding Source
281
+ may be on a different server (operated by you or a third party)
282
+ that supports equivalent copying facilities, provided you maintain
283
+ clear directions next to the object code saying where to find the
284
+ Corresponding Source. Regardless of what server hosts the
285
+ Corresponding Source, you remain obligated to ensure that it is
286
+ available for as long as needed to satisfy these requirements.
287
+
288
+ e) Convey the object code using peer-to-peer transmission, provided
289
+ you inform other peers where the object code and Corresponding
290
+ Source of the work are being offered to the general public at no
291
+ charge under subsection 6d.
292
+
293
+ A separable portion of the object code, whose source code is excluded
294
+ from the Corresponding Source as a System Library, need not be
295
+ included in conveying the object code work.
296
+
297
+ A "User Product" is either (1) a "consumer product", which means any
298
+ tangible personal property which is normally used for personal, family,
299
+ or household purposes, or (2) anything designed or sold for incorporation
300
+ into a dwelling. In determining whether a product is a consumer product,
301
+ doubtful cases shall be resolved in favor of coverage. For a particular
302
+ product received by a particular user, "normally used" refers to a
303
+ typical or common use of that class of product, regardless of the status
304
+ of the particular user or of the way in which the particular user
305
+ actually uses, or expects or is expected to use, the product. A product
306
+ is a consumer product regardless of whether the product has substantial
307
+ commercial, industrial or non-consumer uses, unless such uses represent
308
+ the only significant mode of use of the product.
309
+
310
+ "Installation Information" for a User Product means any methods,
311
+ procedures, authorization keys, or other information required to install
312
+ and execute modified versions of a covered work in that User Product from
313
+ a modified version of its Corresponding Source. The information must
314
+ suffice to ensure that the continued functioning of the modified object
315
+ code is in no case prevented or interfered with solely because
316
+ modification has been made.
317
+
318
+ If you convey an object code work under this section in, or with, or
319
+ specifically for use in, a User Product, and the conveying occurs as
320
+ part of a transaction in which the right of possession and use of the
321
+ User Product is transferred to the recipient in perpetuity or for a
322
+ fixed term (regardless of how the transaction is characterized), the
323
+ Corresponding Source conveyed under this section must be accompanied
324
+ by the Installation Information. But this requirement does not apply
325
+ if neither you nor any third party retains the ability to install
326
+ modified object code on the User Product (for example, the work has
327
+ been installed in ROM).
328
+
329
+ The requirement to provide Installation Information does not include a
330
+ requirement to continue to provide support service, warranty, or updates
331
+ for a work that has been modified or installed by the recipient, or for
332
+ the User Product in which it has been modified or installed. Access to a
333
+ network may be denied when the modification itself materially and
334
+ adversely affects the operation of the network or violates the rules and
335
+ protocols for communication across the network.
336
+
337
+ Corresponding Source conveyed, and Installation Information provided,
338
+ in accord with this section must be in a format that is publicly
339
+ documented (and with an implementation available to the public in
340
+ source code form), and must require no special password or key for
341
+ unpacking, reading or copying.
342
+
343
+ 7. Additional Terms.
344
+
345
+ "Additional permissions" are terms that supplement the terms of this
346
+ License by making exceptions from one or more of its conditions.
347
+ Additional permissions that are applicable to the entire Program shall
348
+ be treated as though they were included in this License, to the extent
349
+ that they are valid under applicable law. If additional permissions
350
+ apply only to part of the Program, that part may be used separately
351
+ under those permissions, but the entire Program remains governed by
352
+ this License without regard to the additional permissions.
353
+
354
+ When you convey a copy of a covered work, you may at your option
355
+ remove any additional permissions from that copy, or from any part of
356
+ it. (Additional permissions may be written to require their own
357
+ removal in certain cases when you modify the work.) You may place
358
+ additional permissions on material, added by you to a covered work,
359
+ for which you have or can give appropriate copyright permission.
360
+
361
+ Notwithstanding any other provision of this License, for material you
362
+ add to a covered work, you may (if authorized by the copyright holders of
363
+ that material) supplement the terms of this License with terms:
364
+
365
+ a) Disclaiming warranty or limiting liability differently from the
366
+ terms of sections 15 and 16 of this License; or
367
+
368
+ b) Requiring preservation of specified reasonable legal notices or
369
+ author attributions in that material or in the Appropriate Legal
370
+ Notices displayed by works containing it; or
371
+
372
+ c) Prohibiting misrepresentation of the origin of that material, or
373
+ requiring that modified versions of such material be marked in
374
+ reasonable ways as different from the original version; or
375
+
376
+ d) Limiting the use for publicity purposes of names of licensors or
377
+ authors of the material; or
378
+
379
+ e) Declining to grant rights under trademark law for use of some
380
+ trade names, trademarks, or service marks; or
381
+
382
+ f) Requiring indemnification of licensors and authors of that
383
+ material by anyone who conveys the material (or modified versions of
384
+ it) with contractual assumptions of liability to the recipient, for
385
+ any liability that these contractual assumptions directly impose on
386
+ those licensors and authors.
387
+
388
+ All other non-permissive additional terms are considered "further
389
+ restrictions" within the meaning of section 10. If the Program as you
390
+ received it, or any part of it, contains a notice stating that it is
391
+ governed by this License along with a term that is a further
392
+ restriction, you may remove that term. If a license document contains
393
+ a further restriction but permits relicensing or conveying under this
394
+ License, you may add to a covered work material governed by the terms
395
+ of that license document, provided that the further restriction does
396
+ not survive such relicensing or conveying.
397
+
398
+ If you add terms to a covered work in accord with this section, you
399
+ must place, in the relevant source files, a statement of the
400
+ additional terms that apply to those files, or a notice indicating
401
+ where to find the applicable terms.
402
+
403
+ Additional terms, permissive or non-permissive, may be stated in the
404
+ form of a separately written license, or stated as exceptions;
405
+ the above requirements apply either way.
406
+
407
+ 8. Termination.
408
+
409
+ You may not propagate or modify a covered work except as expressly
410
+ provided under this License. Any attempt otherwise to propagate or
411
+ modify it is void, and will automatically terminate your rights under
412
+ this License (including any patent licenses granted under the third
413
+ paragraph of section 11).
414
+
415
+ However, if you cease all violation of this License, then your
416
+ license from a particular copyright holder is reinstated (a)
417
+ provisionally, unless and until the copyright holder explicitly and
418
+ finally terminates your license, and (b) permanently, if the copyright
419
+ holder fails to notify you of the violation by some reasonable means
420
+ prior to 60 days after the cessation.
421
+
422
+ Moreover, your license from a particular copyright holder is
423
+ reinstated permanently if the copyright holder notifies you of the
424
+ violation by some reasonable means, this is the first time you have
425
+ received notice of violation of this License (for any work) from that
426
+ copyright holder, and you cure the violation prior to 30 days after
427
+ your receipt of the notice.
428
+
429
+ Termination of your rights under this section does not terminate the
430
+ licenses of parties who have received copies or rights from you under
431
+ this License. If your rights have been terminated and not permanently
432
+ reinstated, you do not qualify to receive new licenses for the same
433
+ material under section 10.
434
+
435
+ 9. Acceptance Not Required for Having Copies.
436
+
437
+ You are not required to accept this License in order to receive or
438
+ run a copy of the Program. Ancillary propagation of a covered work
439
+ occurring solely as a consequence of using peer-to-peer transmission
440
+ to receive a copy likewise does not require acceptance. However,
441
+ nothing other than this License grants you permission to propagate or
442
+ modify any covered work. These actions infringe copyright if you do
443
+ not accept this License. Therefore, by modifying or propagating a
444
+ covered work, you indicate your acceptance of this License to do so.
445
+
446
+ 10. Automatic Licensing of Downstream Recipients.
447
+
448
+ Each time you convey a covered work, the recipient automatically
449
+ receives a license from the original licensors, to run, modify and
450
+ propagate that work, subject to this License. You are not responsible
451
+ for enforcing compliance by third parties with this License.
452
+
453
+ An "entity transaction" is a transaction transferring control of an
454
+ organization, or substantially all assets of one, or subdividing an
455
+ organization, or merging organizations. If propagation of a covered
456
+ work results from an entity transaction, each party to that
457
+ transaction who receives a copy of the work also receives whatever
458
+ licenses to the work the party's predecessor in interest had or could
459
+ give under the previous paragraph, plus a right to possession of the
460
+ Corresponding Source of the work from the predecessor in interest, if
461
+ the predecessor has it or can get it with reasonable efforts.
462
+
463
+ You may not impose any further restrictions on the exercise of the
464
+ rights granted or affirmed under this License. For example, you may
465
+ not impose a license fee, royalty, or other charge for exercise of
466
+ rights granted under this License, and you may not initiate litigation
467
+ (including a cross-claim or counterclaim in a lawsuit) alleging that
468
+ any patent claim is infringed by making, using, selling, offering for
469
+ sale, or importing the Program or any portion of it.
470
+
471
+ 11. Patents.
472
+
473
+ A "contributor" is a copyright holder who authorizes use under this
474
+ License of the Program or a work on which the Program is based. The
475
+ work thus licensed is called the contributor's "contributor version".
476
+
477
+ A contributor's "essential patent claims" are all patent claims
478
+ owned or controlled by the contributor, whether already acquired or
479
+ hereafter acquired, that would be infringed by some manner, permitted
480
+ by this License, of making, using, or selling its contributor version,
481
+ but do not include claims that would be infringed only as a
482
+ consequence of further modification of the contributor version. For
483
+ purposes of this definition, "control" includes the right to grant
484
+ patent sublicenses in a manner consistent with the requirements of
485
+ this License.
486
+
487
+ Each contributor grants you a non-exclusive, worldwide, royalty-free
488
+ patent license under the contributor's essential patent claims, to
489
+ make, use, sell, offer for sale, import and otherwise run, modify and
490
+ propagate the contents of its contributor version.
491
+
492
+ In the following three paragraphs, a "patent license" is any express
493
+ agreement or commitment, however denominated, not to enforce a patent
494
+ (such as an express permission to practice a patent or covenant not to
495
+ sue for patent infringement). To "grant" such a patent license to a
496
+ party means to make such an agreement or commitment not to enforce a
497
+ patent against the party.
498
+
499
+ If you convey a covered work, knowingly relying on a patent license,
500
+ and the Corresponding Source of the work is not available for anyone
501
+ to copy, free of charge and under the terms of this License, through a
502
+ publicly available network server or other readily accessible means,
503
+ then you must either (1) cause the Corresponding Source to be so
504
+ available, or (2) arrange to deprive yourself of the benefit of the
505
+ patent license for this particular work, or (3) arrange, in a manner
506
+ consistent with the requirements of this License, to extend the patent
507
+ license to downstream recipients. "Knowingly relying" means you have
508
+ actual knowledge that, but for the patent license, your conveying the
509
+ covered work in a country, or your recipient's use of the covered work
510
+ in a country, would infringe one or more identifiable patents in that
511
+ country that you have reason to believe are valid.
512
+
513
+ If, pursuant to or in connection with a single transaction or
514
+ arrangement, you convey, or propagate by procuring conveyance of, a
515
+ covered work, and grant a patent license to some of the parties
516
+ receiving the covered work authorizing them to use, propagate, modify
517
+ or convey a specific copy of the covered work, then the patent license
518
+ you grant is automatically extended to all recipients of the covered
519
+ work and works based on it.
520
+
521
+ A patent license is "discriminatory" if it does not include within
522
+ the scope of its coverage, prohibits the exercise of, or is
523
+ conditioned on the non-exercise of one or more of the rights that are
524
+ specifically granted under this License. You may not convey a covered
525
+ work if you are a party to an arrangement with a third party that is
526
+ in the business of distributing software, under which you make payment
527
+ to the third party based on the extent of your activity of conveying
528
+ the work, and under which the third party grants, to any of the
529
+ parties who would receive the covered work from you, a discriminatory
530
+ patent license (a) in connection with copies of the covered work
531
+ conveyed by you (or copies made from those copies), or (b) primarily
532
+ for and in connection with specific products or compilations that
533
+ contain the covered work, unless you entered into that arrangement,
534
+ or that patent license was granted, prior to 28 March 2007.
535
+
536
+ Nothing in this License shall be construed as excluding or limiting
537
+ any implied license or other defenses to infringement that may
538
+ otherwise be available to you under applicable patent law.
539
+
540
+ 12. No Surrender of Others' Freedom.
541
+
542
+ If conditions are imposed on you (whether by court order, agreement or
543
+ otherwise) that contradict the conditions of this License, they do not
544
+ excuse you from the conditions of this License. If you cannot convey a
545
+ covered work so as to satisfy simultaneously your obligations under this
546
+ License and any other pertinent obligations, then as a consequence you may
547
+ not convey it at all. For example, if you agree to terms that obligate you
548
+ to collect a royalty for further conveying from those to whom you convey
549
+ the Program, the only way you could satisfy both those terms and this
550
+ License would be to refrain entirely from conveying the Program.
551
+
552
+ 13. Use with the GNU Affero General Public License.
553
+
554
+ Notwithstanding any other provision of this License, you have
555
+ permission to link or combine any covered work with a work licensed
556
+ under version 3 of the GNU Affero General Public License into a single
557
+ combined work, and to convey the resulting work. The terms of this
558
+ License will continue to apply to the part which is the covered work,
559
+ but the special requirements of the GNU Affero General Public License,
560
+ section 13, concerning interaction through a network will apply to the
561
+ combination as such.
562
+
563
+ 14. Revised Versions of this License.
564
+
565
+ The Free Software Foundation may publish revised and/or new versions of
566
+ the GNU General Public License from time to time. Such new versions will
567
+ be similar in spirit to the present version, but may differ in detail to
568
+ address new problems or concerns.
569
+
570
+ Each version is given a distinguishing version number. If the
571
+ Program specifies that a certain numbered version of the GNU General
572
+ Public License "or any later version" applies to it, you have the
573
+ option of following the terms and conditions either of that numbered
574
+ version or of any later version published by the Free Software
575
+ Foundation. If the Program does not specify a version number of the
576
+ GNU General Public License, you may choose any version ever published
577
+ by the Free Software Foundation.
578
+
579
+ If the Program specifies that a proxy can decide which future
580
+ versions of the GNU General Public License can be used, that proxy's
581
+ public statement of acceptance of a version permanently authorizes you
582
+ to choose that version for the Program.
583
+
584
+ Later license versions may give you additional or different
585
+ permissions. However, no additional obligations are imposed on any
586
+ author or copyright holder as a result of your choosing to follow a
587
+ later version.
588
+
589
+ 15. Disclaimer of Warranty.
590
+
591
+ THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
592
+ APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
593
+ HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
594
+ OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
595
+ THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
596
+ PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
597
+ IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
598
+ ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
599
+
600
+ 16. Limitation of Liability.
601
+
602
+ IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
603
+ WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
604
+ THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
605
+ GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
606
+ USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
607
+ DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
608
+ PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
609
+ EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
610
+ SUCH DAMAGES.
611
+
612
+ 17. Interpretation of Sections 15 and 16.
613
+
614
+ If the disclaimer of warranty and limitation of liability provided
615
+ above cannot be given local legal effect according to their terms,
616
+ reviewing courts shall apply local law that most closely approximates
617
+ an absolute waiver of all civil liability in connection with the
618
+ Program, unless a warranty or assumption of liability accompanies a
619
+ copy of the Program in return for a fee.
620
+
621
+ END OF TERMS AND CONDITIONS
622
+
623
+ How to Apply These Terms to Your New Programs
624
+
625
+ If you develop a new program, and you want it to be of the greatest
626
+ possible use to the public, the best way to achieve this is to make it
627
+ free software which everyone can redistribute and change under these terms.
628
+
629
+ To do so, attach the following notices to the program. It is safest
630
+ to attach them to the start of each source file to most effectively
631
+ state the exclusion of warranty; and each file should have at least
632
+ the "copyright" line and a pointer to where the full notice is found.
633
+
634
+ <one line to give the program's name and a brief idea of what it does.>
635
+ Copyright (C) <year> <name of author>
636
+
637
+ This program is free software: you can redistribute it and/or modify
638
+ it under the terms of the GNU General Public License as published by
639
+ the Free Software Foundation, either version 3 of the License, or
640
+ (at your option) any later version.
641
+
642
+ This program is distributed in the hope that it will be useful,
643
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
644
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
645
+ GNU General Public License for more details.
646
+
647
+ You should have received a copy of the GNU General Public License
648
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
649
+
650
+ Also add information on how to contact you by electronic and paper mail.
651
+
652
+ If the program does terminal interaction, make it output a short
653
+ notice like this when it starts in an interactive mode:
654
+
655
+ <program> Copyright (C) <year> <name of author>
656
+ This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
657
+ This is free software, and you are welcome to redistribute it
658
+ under certain conditions; type `show c' for details.
659
+
660
+ The hypothetical commands `show w' and `show c' should show the appropriate
661
+ parts of the General Public License. Of course, your program's commands
662
+ might be different; for a GUI interface, you would use an "about box".
663
+
664
+ You should also get your employer (if you work as a programmer) or school,
665
+ if any, to sign a "copyright disclaimer" for the program, if necessary.
666
+ For more information on this, and how to apply and follow the GNU GPL, see
667
+ <https://www.gnu.org/licenses/>.
668
+
669
+ The GNU General Public License does not permit incorporating your program
670
+ into proprietary programs. If your program is a subroutine library, you
671
+ may consider it more useful to permit linking proprietary applications with
672
+ the library. If this is what you want to do, use the GNU Lesser General
673
+ Public License instead of this License. But first, please read
674
+ <https://www.gnu.org/licenses/why-not-lgpl.html>.
models/icpr2020dfdc/README.md ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Video Face Manipulation Detection Through Ensemble of CNNs
2
+ [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/video-face-manipulation-detection-through/deepfake-detection-on-dfdc)](https://paperswithcode.com/sota/deepfake-detection-on-dfdc?p=video-face-manipulation-detection-through)
3
+ [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/video-face-manipulation-detection-through/deepfake-detection-on-faceforensics-1)](https://paperswithcode.com/sota/deepfake-detection-on-faceforensics-1?p=video-face-manipulation-detection-through)
4
+ [![Build Status](https://travis-ci.org/polimi-ispl/icpr2020dfdc.svg?branch=master)](https://travis-ci.org/polimi-ispl/icpr2020dfdc)
5
+
6
+ ![](assets/faces_attention.png)
7
+
8
+ <p align='center'>
9
+ <img src='assets/mqzvfufzoq_face.gif'/>
10
+ <img src='assets/mqzvfufzoq_face_att.gif'/>
11
+ </p>
12
+
13
+ This is the official repository of **Video Face Manipulation Detection Through Ensemble of CNNs**,
14
+ presented at [ICPR2020](https://www.micc.unifi.it/icpr2020/) and currently available on [IEEExplore](https://ieeexplore.ieee.org/document/9412711) and [arXiv](https://arxiv.org/abs/2004.07676).
15
+ If you use this repository for your research, please consider citing our paper. Refer to [How to cite](https://github.com/polimi-ispl/icpr2020dfdc#how-to-cite) section to get the correct entry for your bibliography.
16
+
17
+ We participated as the **ISPL** team in the [Kaggle Deepfake Detection Challenge](https://www.kaggle.com/c/deepfake-detection-challenge/).
18
+ With this implementation, we reached the 41st position out of 2116 teams (**top 2%**) on the [private leaderboard](https://www.kaggle.com/c/deepfake-detection-challenge/leaderboard).
19
+
20
+ This repository is currently under maintenance; if you experience any problems, please open an [issue](https://github.com/polimi-ispl/icpr2020dfdc/issues).
21
+ ## Getting started
22
+
23
+ ### Prerequisites
24
+ - Install [conda](https://docs.conda.io/en/latest/miniconda.html)
25
+ - Create the `icpr2020` environment with *environment.yml*
26
+ ```bash
27
+ $ conda env create -f environment.yml
28
+ $ conda activate icpr2020
29
+ ```
30
+ - Download and unzip the [datasets](#datasets)
31
+
32
+ ### Quick run
33
+ If you just want to test the pre-trained models against your own videos or images:
34
+ - [Video prediction notebook](https://github.com/polimi-ispl/icpr2020dfdc/blob/master/notebook/Video%20prediction.ipynb) <a target="_blank" href="https://colab.research.google.com/drive/12WnvmerHBNbJ49HdoH1lli_O8SwaFPjv?usp=sharing">
35
+ <img src="https://colab.research.google.com/assets/colab-badge.svg">
36
+ </a>
37
+
38
+ - [Image prediction notebook](https://github.com/polimi-ispl/icpr2020dfdc/blob/master/notebook/Image%20prediction.ipynb) <a target="_blank" href="https://colab.research.google.com/drive/19oVKlzEr58VZfRnSq-nW8kFYuxkh3GM8?usp=sharing">
39
+ <img src="https://colab.research.google.com/assets/colab-badge.svg">
40
+ </a>
41
+
42
+ - [Image prediction with attention](notebook/Image%20prediction%20and%20attention.ipynb) <a target="_blank" href="https://colab.research.google.com/drive/1zcglis2Qx2vtJhrogn8aKA-mbUotLZLK?usp=sharing">
43
+ <img src="https://colab.research.google.com/assets/colab-badge.svg">
44
+ </a>
45
+
46
+ ### The whole pipeline
47
+ You need to preprocess the datasets in order to index all the samples and extract faces. Just run the script [make_dataset.sh](scripts/make_dataset.sh)
48
+
49
+ ```bash
50
+ $ ./scripts/make_dataset.sh
51
+ ```
52
+
53
+ Please note that we use only 32 frames per video. You can easily tweak this parameter in [extract_faces.py](extract_faces.py)
54
+ Also, please note that **for the DFDC** we have resorted to _the training split_ exclusively!
55
+ In `scripts/make_dataset.sh` the value of `DFDC_SRC` should point to the directory containing the DFDC train split.
56
+
57
+
58
+ ### Celeb-DF (v2)
59
+ Although **we did not use this dataset in the paper**, we provide a script [index_celebdf.py](index_celebdf.py) to index the videos similarly to
60
+ DFDC and FF++. Once you have the index, you can proceed with the pipeline starting from [extract_faces.py](extract_faces.py). You can also use the
61
+ split `celebdf` during training/testing.
62
+
63
+ ### Train
64
+ In [train_all.sh](scripts/train_all.sh) you can find a comprehensive list of all the commands to train the models presented in the paper.
65
+ Please refer to the comments in the script for hints on their usage.
66
+
67
+ #### Training a single model
68
+ If you want to train some models without launching the script:
69
+ - for the **non-siamese** architectures (e.g. EfficientNetB4, EfficientNetB4Att), you can simply specify the model in [train_binclass.py](train_binclass.py) with the *--net* parameter;
70
+ - for the **siamese** architectures (e.g. EfficientNetB4ST, EfficientNetB4AttST), you have to:
71
+ 1. train the architecture as a feature extractor first, using the [train_triplet.py](train_triplet.py) script and being careful of specifying its name with the *--net* parameter **without** the ST suffix. For instance, for training the EfficientNetB4ST you will have to first run `python train_triplet.py --net EfficientNetB4 --otherparams`;
72
+ 2. finetune the model using [train_binclass.py](train_binclass.py), being careful this time to specify the architecture's name **with** the ST suffix and to insert as *--init* argument the path to the weights of the feature extractor trained at the previous step. You will end up running something like `python train_binclass.py --net EfficientNetB4ST --init path/to/EfficientNetB4/weights/trained/with/train_triplet/weights.pth --otherparams`
73
+
74
+ ### Test
75
+ In [test_all.sh](scripts/test_all.sh) you can find a comprehensive list of all the commands for testing the models presented in the paper.
76
+
77
+ #### Pretrained weights
78
+ We also provide pretrained weights for all the architectures presented in the paper.
79
+ Please refer to this [Dropbox link](https://www.dropbox.com/sh/cesamx5ytd5j08c/AADG_eEmhskliMaT0Gbk-yHDa?dl=0).
80
+ Each directory is named `$NETWORK_$DATASET` where `$NETWORK` is the architecture name and `$DATASET` is the training dataset.
81
+ In each directory, you can find `bestval.pth` which are the best network weights according to the validation set.
82
+
83
+
84
+ Additionally, you can find Jupyter notebooks for results computations in the [notebook](notebook) folder.
85
+
86
+
87
+ ## Datasets
88
+ - [Facebook's DeepFake Detection Challenge (DFDC) train dataset](https://www.kaggle.com/c/deepfake-detection-challenge/data) | [arXiv paper](https://arxiv.org/abs/2006.07397)
89
+ - [FaceForensics++](https://github.com/ondyari/FaceForensics/blob/master/dataset/README.md) | [arXiv paper](https://arxiv.org/abs/1901.08971)
90
+ - [Celeb-DF (v2)](http://www.cs.albany.edu/~lsw/celeb-deepfakeforensics.html) | [arXiv paper](https://arxiv.org/abs/1909.12962) (**Just for reference, not used in the paper**)
91
+
92
+ ## References
93
+ - [EfficientNet PyTorch](https://github.com/lukemelas/EfficientNet-PyTorch)
94
+ - [Xception PyTorch](https://github.com/tstandley/Xception-PyTorch)
95
+
96
+ ## How to cite
97
+ Plain text:
98
+ ```
99
+ N. Bonettini, E. D. Cannas, S. Mandelli, L. Bondi, P. Bestagini and S. Tubaro, "Video Face Manipulation Detection Through Ensemble of CNNs," 2020 25th International Conference on Pattern Recognition (ICPR), 2021, pp. 5012-5019, doi: 10.1109/ICPR48806.2021.9412711.
100
+ ```
101
+
102
+ Bibtex:
103
+ ```bibtex
104
+ @INPROCEEDINGS{9412711,
105
+ author={Bonettini, Nicolò and Cannas, Edoardo Daniele and Mandelli, Sara and Bondi, Luca and Bestagini, Paolo and Tubaro, Stefano},
106
+ booktitle={2020 25th International Conference on Pattern Recognition (ICPR)},
107
+ title={Video Face Manipulation Detection Through Ensemble of CNNs},
108
+ year={2021},
109
+ volume={},
110
+ number={},
111
+ pages={5012-5019},
112
+ doi={10.1109/ICPR48806.2021.9412711}}
113
+ ```
114
+ ## Credits
115
+ [Image and Sound Processing Lab - Politecnico di Milano](http://ispl.deib.polimi.it/)
116
+ - Nicolò Bonettini
117
+ - Edoardo Daniele Cannas
118
+ - Sara Mandelli
119
+ - Luca Bondi
120
+ - Paolo Bestagini
models/icpr2020dfdc/architectures/__init__.py ADDED
File without changes
models/icpr2020dfdc/architectures/externals/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .xception import xception
models/icpr2020dfdc/architectures/externals/xception.py ADDED
@@ -0,0 +1,236 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Ported to pytorch thanks to [tstandley](https://github.com/tstandley/Xception-PyTorch)
3
+
4
+ @author: tstandley
5
+ Adapted by cadene
6
+
7
+ Creates an Xception Model as defined in:
8
+
9
+ Francois Chollet
10
+ Xception: Deep Learning with Depthwise Separable Convolutions
11
+ https://arxiv.org/pdf/1610.02357.pdf
12
+
13
+ These weights were ported from the Keras implementation. They achieve the following performance on the validation set:
14
+
15
+ Loss:0.9173 Prec@1:78.892 Prec@5:94.292
16
+
17
+ REMEMBER to set your image size to 3x299x299 for both test and validation
18
+
19
+ normalize = transforms.Normalize(mean=[0.5, 0.5, 0.5],
20
+ std=[0.5, 0.5, 0.5])
21
+
22
+ The resize parameter of the validation transform should be 333, and make sure to center crop at 299x299
23
+ """
24
+ from __future__ import print_function, division, absolute_import
25
+
26
+ import torch.nn as nn
27
+ import torch.nn.functional as F
28
+ import torch.utils.model_zoo as model_zoo
29
+
30
+ __all__ = ['xception']
31
+
32
# Description of the ImageNet checkpoint: where to download it and the input
# convention recorded alongside it.
# The resize parameter of the validation transform should be 333, and make
# sure to center crop at 299x299.
_XCEPTION_IMAGENET = {
    'url': 'http://data.lip6.fr/cadene/pretrainedmodels/xception-43020ad28.pth',
    'input_space': 'RGB',
    'input_size': [3, 299, 299],
    'input_range': [0, 1],
    'mean': [0.5, 0.5, 0.5],
    'std': [0.5, 0.5, 0.5],
    'num_classes': 1000,
    'scale': 0.8975,
}

# Lookup table indexed as pretrained_settings[<architecture>][<weights name>].
pretrained_settings = {
    'xception': {
        'imagenet': _XCEPTION_IMAGENET,
    },
}
47
+
48
+
49
class SeparableConv2d(nn.Module):
    """
    Depthwise separable 2D convolution.

    A per-channel (grouped) convolution, ``conv1``, is followed by a 1x1
    convolution, ``pointwise``, that maps ``in_channels`` to ``out_channels``.
    """

    def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, padding=0, dilation=1, bias=False):
        """ Constructor
        Args:
            in_channels: channels of the input tensor
            out_channels: channels produced by the pointwise convolution
            kernel_size: kernel size of the depthwise convolution
            stride: stride of the depthwise convolution
            padding: padding of the depthwise convolution
            dilation: dilation of the depthwise convolution
            bias: whether both convolutions carry a bias term
        """
        super(SeparableConv2d, self).__init__()

        # groups == in_channels: every input channel is filtered on its own.
        self.conv1 = nn.Conv2d(
            in_channels,
            in_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=in_channels,
            bias=bias,
        )
        # 1x1 convolution mixing the channels.
        self.pointwise = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size=1,
            stride=1,
            padding=0,
            dilation=1,
            groups=1,
            bias=bias,
        )

    def forward(self, x):
        """Apply the depthwise convolution, then the pointwise one."""
        return self.pointwise(self.conv1(x))
61
+
62
+
63
class Block(nn.Module):
    """
    Residual block made of ``reps`` ReLU / SeparableConv2d / BatchNorm2d units.

    The input is added to the output of the unit stack. When the channel
    count or the stride changes, the input first goes through a 1x1
    convolution plus batch norm so that both operands of the sum agree.
    """

    def __init__(self, in_filters, out_filters, reps, strides=1, start_with_relu=True, grow_first=True):
        """ Constructor
        Args:
            in_filters: channels of the input tensor
            out_filters: channels of the output tensor
            reps: number of ReLU / separable conv / batch norm units
            strides: stride of the block; when != 1 a max pooling closes the stack
            start_with_relu: if False, the leading ReLU of the stack is dropped
            grow_first: if True the channel change happens in the first unit,
                otherwise in the last one
        """
        super(Block, self).__init__()

        # Projection shortcut, needed only when the shapes would not match.
        changes_shape = out_filters != in_filters or strides != 1
        self.skip = nn.Conv2d(in_filters, out_filters, 1, stride=strides, bias=False) if changes_shape else None
        if self.skip is not None:
            self.skipbn = nn.BatchNorm2d(out_filters)

        def unit(c_in, c_out):
            # One ReLU -> 3x3 separable conv -> batch norm triple.
            return [
                nn.ReLU(inplace=True),
                SeparableConv2d(c_in, c_out, 3, stride=1, padding=1, bias=False),
                nn.BatchNorm2d(c_out),
            ]

        layers = []
        width = in_filters
        if grow_first:
            layers += unit(in_filters, out_filters)
            width = out_filters

        for _ in range(reps - 1):
            layers += unit(width, width)

        if not grow_first:
            layers += unit(in_filters, out_filters)

        if start_with_relu:
            # The first ReLU must not be in-place: it would overwrite the
            # block input, which forward() still needs for the shortcut.
            layers[0] = nn.ReLU(inplace=False)
        else:
            del layers[0]

        if strides != 1:
            layers.append(nn.MaxPool2d(3, strides, 1))
        self.rep = nn.Sequential(*layers)

    def forward(self, inp):
        """Return the unit stack output summed with the (projected) input."""
        out = self.rep(inp)

        if self.skip is None:
            shortcut = inp
        else:
            shortcut = self.skipbn(self.skip(inp))

        out += shortcut
        return out
112
+
113
+
114
class Xception(nn.Module):
    """
    Xception optimized for the ImageNet dataset, as specified in
    https://arxiv.org/pdf/1610.02357.pdf

    The classification layer is created as ``fc``. The ``xception`` factory
    renames it to ``last_linear``; ``logits`` accepts either name, so the
    class also works when it is instantiated directly.
    """

    def __init__(self, num_classes=1000):
        """ Constructor
        Args:
            num_classes: number of classes
        """
        super(Xception, self).__init__()
        self.num_classes = num_classes

        self.conv1 = nn.Conv2d(3, 32, 3, 2, 0, bias=False)
        self.bn1 = nn.BatchNorm2d(32)
        self.relu1 = nn.ReLU(inplace=True)

        self.conv2 = nn.Conv2d(32, 64, 3, bias=False)
        self.bn2 = nn.BatchNorm2d(64)
        self.relu2 = nn.ReLU(inplace=True)
        # do relu here

        self.block1 = Block(64, 128, 2, 2, start_with_relu=False, grow_first=True)
        self.block2 = Block(128, 256, 2, 2, start_with_relu=True, grow_first=True)
        self.block3 = Block(256, 728, 2, 2, start_with_relu=True, grow_first=True)

        self.block4 = Block(728, 728, 3, 1, start_with_relu=True, grow_first=True)
        self.block5 = Block(728, 728, 3, 1, start_with_relu=True, grow_first=True)
        self.block6 = Block(728, 728, 3, 1, start_with_relu=True, grow_first=True)
        self.block7 = Block(728, 728, 3, 1, start_with_relu=True, grow_first=True)

        self.block8 = Block(728, 728, 3, 1, start_with_relu=True, grow_first=True)
        self.block9 = Block(728, 728, 3, 1, start_with_relu=True, grow_first=True)
        self.block10 = Block(728, 728, 3, 1, start_with_relu=True, grow_first=True)
        self.block11 = Block(728, 728, 3, 1, start_with_relu=True, grow_first=True)

        self.block12 = Block(728, 1024, 2, 2, start_with_relu=True, grow_first=False)

        self.conv3 = SeparableConv2d(1024, 1536, 3, 1, 1)
        self.bn3 = nn.BatchNorm2d(1536)
        self.relu3 = nn.ReLU(inplace=True)

        # do relu here
        self.conv4 = SeparableConv2d(1536, 2048, 3, 1, 1)
        self.bn4 = nn.BatchNorm2d(2048)

        self.fc = nn.Linear(2048, num_classes)

        # NOTE: no custom weight initialisation is applied here; the layers
        # keep whatever initialisation their own constructors perform.

    def features(self, input):
        """
        Run the convolutional trunk.

        Args:
            input: image batch fed to the first convolution (3 input channels)
        Returns:
            the output of ``bn4`` (2048 channels); the final ReLU is applied
            later, in ``logits``
        """
        x = self.conv1(input)
        x = self.bn1(x)
        x = self.relu1(x)

        x = self.conv2(x)
        x = self.bn2(x)
        x = self.relu2(x)

        x = self.block1(x)
        x = self.block2(x)
        x = self.block3(x)
        x = self.block4(x)
        x = self.block5(x)
        x = self.block6(x)
        x = self.block7(x)
        x = self.block8(x)
        x = self.block9(x)
        x = self.block10(x)
        x = self.block11(x)
        x = self.block12(x)

        x = self.conv3(x)
        x = self.bn3(x)
        x = self.relu3(x)

        x = self.conv4(x)
        x = self.bn4(x)
        return x

    def logits(self, features):
        """
        Turn trunk features into class scores.

        Args:
            features: tensor produced by ``features``; it is modified in
                place by the ReLU
        Returns:
            tensor with one row per sample and one score per class
        """
        x = nn.ReLU(inplace=True)(features)

        x = F.adaptive_avg_pool2d(x, (1, 1))
        x = x.view(x.size(0), -1)

        # The factory renames ``fc`` to ``last_linear``. A directly built
        # instance only has ``fc``, so fall back to it instead of failing
        # with AttributeError.
        classifier = getattr(self, 'last_linear', None)
        if classifier is None:
            classifier = self.fc
        x = classifier(x)
        return x

    def forward(self, input):
        """Compute class scores for ``input`` (``features`` then ``logits``)."""
        x = self.features(input)
        x = self.logits(x)
        return x
215
+
216
+
217
def xception(num_classes=1000, pretrained='imagenet'):
    """
    Build an Xception network, optionally loading pretrained weights.

    Args:
        num_classes: size of the classification layer; must match the
            checkpoint's ``num_classes`` when ``pretrained`` is set
        pretrained: key of ``pretrained_settings['xception']`` selecting the
            checkpoint to download, or a falsy value to keep the freshly
            initialised weights
    Returns:
        the model, with its classification layer exposed as ``last_linear``
        (the ``fc`` attribute is removed)
    Raises:
        KeyError: if ``pretrained`` is not a known checkpoint name
        AssertionError: if ``num_classes`` does not match the checkpoint
    """
    settings = None
    if pretrained:
        # Validate before building anything, so a bad request fails fast.
        settings = pretrained_settings['xception'][pretrained]
        assert num_classes == settings['num_classes'], \
            "num_classes should be {}, but is {}".format(settings['num_classes'], num_classes)

    # Build the network exactly once; the pretrained path used to build it twice.
    model = Xception(num_classes=num_classes)

    if settings is not None:
        # NOTE(review): the checkpoint URL is plain HTTP and the downloaded
        # file is deserialised by torch; consider a trusted mirror and/or
        # hash checking.
        model.load_state_dict(model_zoo.load_url(settings['url']))

        model.input_space = settings['input_space']
        model.input_size = settings['input_size']
        model.input_range = settings['input_range']
        model.mean = settings['mean']
        model.std = settings['std']

    # TODO: ugly
    # The checkpoint stores the classifier under ``fc``, so the rename can
    # only happen after the weights are loaded.
    model.last_linear = model.fc
    del model.fc
    return model
models/icpr2020dfdc/architectures/fornet.py ADDED
@@ -0,0 +1,245 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Video Face Manipulation Detection Through Ensemble of CNNs
3
+
4
+ Image and Sound Processing Lab - Politecnico di Milano
5
+
6
+ Nicolò Bonettini
7
+ Edoardo Daniele Cannas
8
+ Sara Mandelli
9
+ Luca Bondi
10
+ Paolo Bestagini
11
+ """
12
+ from collections import OrderedDict
13
+
14
+ import torch
15
+ from efficientnet_pytorch import EfficientNet
16
+ from torch import nn as nn
17
+ from torch.nn import functional as F
18
+ from torchvision import transforms
19
+
20
+ from . import externals
21
+
22
+ """
23
+ Feature Extractor
24
+ """
25
+
26
+
27
class FeatureExtractor(nn.Module):
    """
    Base class for networks that can expose their feature vector.
    Subclasses override features(); the base also supplies the input
    normalizer and the default set of trainable parameters.
    """

    def features(self, x: torch.Tensor) -> torch.Tensor:
        """Compute the features of `x`; must be overridden by subclasses."""
        raise NotImplementedError

    def get_trainable_parameters(self):
        """Return the parameters to optimize (all of them by default)."""
        return self.parameters()

    @staticmethod
    def get_normalizer():
        """Return the per-channel normalization transform for input images."""
        # Per-channel statistics (these are the commonly used ImageNet values).
        channel_mean = [0.485, 0.456, 0.406]
        channel_std = [0.229, 0.224, 0.225]
        return transforms.Normalize(mean=channel_mean, std=channel_std)
42
+
43
+
44
+ """
45
+ EfficientNet
46
+ """
47
+
48
+
49
class EfficientNetGen(FeatureExtractor):
    """Pretrained EfficientNet backbone followed by a single-logit linear head."""

    def __init__(self, model: str):
        """
        :param model: EfficientNet variant name passed to
            ``EfficientNet.from_pretrained`` (e.g. 'efficientnet-b4')
        """
        super().__init__()

        backbone = EfficientNet.from_pretrained(model)
        self.efficientnet = backbone
        # The head consumes the channels produced by the backbone's last conv.
        self.classifier = nn.Linear(backbone._conv_head.out_channels, 1)
        # The backbone's own classification layer is not used.
        del backbone._fc

    def features(self, x: torch.Tensor) -> torch.Tensor:
        """Return one pooled, flattened feature vector per sample."""
        fmap = self.efficientnet.extract_features(x)
        pooled = self.efficientnet._avg_pooling(fmap)
        return pooled.flatten(start_dim=1)

    def forward(self, x):
        """Return the single output logit per sample."""
        feats = self.features(x)
        return self.classifier(self.efficientnet._dropout(feats))
68
+
69
+
70
class EfficientNetB4(EfficientNetGen):
    """EfficientNetGen fixed to the 'efficientnet-b4' variant."""

    def __init__(self):
        super().__init__(model='efficientnet-b4')
73
+
74
+
75
+ """
76
+ EfficientNetAutoAtt
77
+ """
78
+
79
+
80
class EfficientNetAutoAtt(EfficientNet):
    """EfficientNet whose feature map is gated by a learned attention mask.

    After the block with index `att_block_idx`, a small conv head (`attconv`)
    produces a one-channel map that is squashed with a sigmoid and multiplied
    into the features. `init_att` must be called before the network is used.
    """

    def init_att(self, model: str, width: int):
        """
        Initialize attention
        :param model: efficientnet-bx, x in {0,..,7}; only 'efficientnet-b4' is handled
        :param width: attention width: number of 3x3 conv + ReLU stages placed
            before the 1x1 output conv (0 means the 1x1 conv alone)
        :return:
        :raises ValueError: if `model` is not supported
        """
        if model == 'efficientnet-b4':
            self.att_block_idx = 9
            # 56 is the channel count fed to attconv; presumably the output
            # width of block 9 in B4 (confirm against the backbone definition).
            if width == 0:
                # Kept as a bare Conv2d (not a Sequential) so that its
                # parameter names stay `attconv.weight` / `attconv.bias`.
                self.attconv = nn.Conv2d(kernel_size=1, in_channels=56, out_channels=1)
            else:
                attconv_layers = []
                for i in range(width):
                    attconv_layers.append(
                        ('conv{:d}'.format(i), nn.Conv2d(kernel_size=3, padding=1, in_channels=56, out_channels=56)))
                    attconv_layers.append(
                        ('relu{:d}'.format(i), nn.ReLU(inplace=True)))
                attconv_layers.append(('conv_out', nn.Conv2d(kernel_size=1, in_channels=56, out_channels=1)))
                self.attconv = nn.Sequential(OrderedDict(attconv_layers))
        else:
            raise ValueError('Model not valid: {}'.format(model))

    def get_attention(self, x: torch.Tensor) -> torch.Tensor:
        """Return the attention mask for `x`.

        Runs the stem and the blocks up to and including `att_block_idx`,
        then stops. Returns None if no block has that index.
        """

        # Placeholder
        att = None

        # Stem
        x = self._swish(self._bn0(self._conv_stem(x)))

        # Blocks
        for idx, block in enumerate(self._blocks):
            # The drop-connect rate grows linearly with the block index.
            drop_connect_rate = self._global_params.drop_connect_rate
            if drop_connect_rate:
                drop_connect_rate *= float(idx) / len(self._blocks)
            x = block(x, drop_connect_rate=drop_connect_rate)
            if idx == self.att_block_idx:
                att = torch.sigmoid(self.attconv(x))
                break

        return att

    def extract_features(self, x: torch.Tensor) -> torch.Tensor:
        """Return the head feature map, with attention applied mid-network."""
        # Stem
        x = self._swish(self._bn0(self._conv_stem(x)))

        # Blocks
        for idx, block in enumerate(self._blocks):
            # The drop-connect rate grows linearly with the block index.
            drop_connect_rate = self._global_params.drop_connect_rate
            if drop_connect_rate:
                drop_connect_rate *= float(idx) / len(self._blocks)
            x = block(x, drop_connect_rate=drop_connect_rate)
            if idx == self.att_block_idx:
                # Gate the features; the one-channel mask broadcasts over channels.
                att = torch.sigmoid(self.attconv(x))
                x = x * att

        # Head
        x = self._swish(self._bn1(self._conv_head(x)))

        return x
142
+
143
+
144
class EfficientNetGenAutoAtt(FeatureExtractor):
    """Attention-augmented EfficientNet backbone with a single-logit head."""

    def __init__(self, model: str, width: int):
        """
        :param model: EfficientNet variant name (e.g. 'efficientnet-b4')
        :param width: attention width forwarded to ``init_att``
        """
        super().__init__()

        backbone = EfficientNetAutoAtt.from_pretrained(model)
        backbone.init_att(model, width)
        self.efficientnet = backbone
        # The head consumes the channels produced by the backbone's last conv.
        self.classifier = nn.Linear(backbone._conv_head.out_channels, 1)
        # The backbone's own classification layer is not used.
        del backbone._fc

    def features(self, x: torch.Tensor) -> torch.Tensor:
        """Return one pooled, flattened feature vector per sample."""
        fmap = self.efficientnet.extract_features(x)
        pooled = self.efficientnet._avg_pooling(fmap)
        return pooled.flatten(start_dim=1)

    def forward(self, x):
        """Return the single output logit per sample."""
        feats = self.features(x)
        return self.classifier(self.efficientnet._dropout(feats))

    def get_attention(self, x: torch.Tensor) -> torch.Tensor:
        """Return the backbone's attention mask for `x`."""
        return self.efficientnet.get_attention(x)
167
+
168
+
169
class EfficientNetAutoAttB4(EfficientNetGenAutoAtt):
    """EfficientNetGenAutoAtt fixed to 'efficientnet-b4' with attention width 0."""

    def __init__(self):
        super().__init__(model='efficientnet-b4', width=0)
172
+
173
+
174
+ """
175
+ Xception
176
+ """
177
+
178
+
179
class Xception(FeatureExtractor):
    """Xception backbone whose final linear layer outputs a single logit."""

    def __init__(self):
        super().__init__()
        backbone = externals.xception()
        # Swap the backbone's classification layer for a 2048 -> 1 head.
        backbone.last_linear = nn.Linear(2048, 1)
        self.xception = backbone

    def features(self, x: torch.Tensor) -> torch.Tensor:
        """Return one pooled, flattened feature vector per sample."""
        fmap = self.xception.features(x)
        fmap = F.relu(fmap, inplace=True)
        pooled = F.adaptive_avg_pool2d(fmap, (1, 1))
        return pooled.view(pooled.size(0), -1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Delegate the full forward pass to the wrapped backbone."""
        return self.xception.forward(x)
194
+
195
+
196
+ """
197
+ Siamese tuning
198
+ """
199
+
200
+
201
class SiameseTuning(FeatureExtractor):
    """Feature extractor followed by a BatchNorm + Linear single-logit head.

    With `lastonly` set, the extractor runs without gradient tracking and
    only the head is reported as trainable.
    """

    def __init__(self, feat_ext: FeatureExtractor, num_feat: int, lastonly: bool = True):
        """
        :param feat_ext: zero-argument callable (in practice a
            FeatureExtractor subclass) that builds the feature extractor
        :param num_feat: length of the feature vector produced by the extractor
        :param lastonly: if True, train the classification head only
        :raises NotImplementedError: if the built extractor has no features() method
        """
        super().__init__()
        extractor = feat_ext()
        self.feat_ext = extractor
        if not hasattr(extractor, 'features'):
            raise NotImplementedError('The provided feature extractor needs to provide a features() method')
        self.lastonly = lastonly
        head_layers = [
            nn.BatchNorm1d(num_features=num_feat),
            nn.Linear(in_features=num_feat, out_features=1),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def features(self, x):
        """Return the wrapped extractor's features for `x`."""
        return self.feat_ext.features(x)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the head's logit computed on the extracted features."""
        if not self.lastonly:
            feats = self.features(x)
        else:
            # Head-only tuning: no gradients flow into the extractor.
            with torch.no_grad():
                feats = self.features(x)
        return self.classifier(feats)

    def get_trainable_parameters(self):
        """Return the head's parameters if `lastonly`, else all parameters."""
        if not self.lastonly:
            return self.parameters()
        return self.classifier.parameters()
231
+
232
+
233
class EfficientNetB4ST(SiameseTuning):
    """SiameseTuning over EfficientNetB4 (1792 features, head-only training)."""

    def __init__(self):
        super().__init__(feat_ext=EfficientNetB4, num_feat=1792, lastonly=True)
236
+
237
+
238
class EfficientNetAutoAttB4ST(SiameseTuning):
    """SiameseTuning over EfficientNetAutoAttB4 (1792 features, head-only training)."""

    def __init__(self):
        super().__init__(feat_ext=EfficientNetAutoAttB4, num_feat=1792, lastonly=True)
241
+
242
+
243
class XceptionST(SiameseTuning):
    """SiameseTuning over Xception (2048 features, head-only training)."""

    def __init__(self):
        super().__init__(feat_ext=Xception, num_feat=2048, lastonly=True)
models/icpr2020dfdc/architectures/tripletnet.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Video Face Manipulation Detection Through Ensemble of CNNs
3
+
4
+ Image and Sound Processing Lab - Politecnico di Milano
5
+
6
+ Nicolò Bonettini
7
+ Edoardo Daniele Cannas
8
+ Sara Mandelli
9
+ Luca Bondi
10
+ Paolo Bestagini
11
+ """
12
+ from . import fornet
13
+ from .fornet import FeatureExtractor
14
+
15
+
16
class TripletNet(FeatureExtractor):
    """
    Template class for triplet net: one shared feature extractor applied
    to each of the three inputs.
    """

    def __init__(self, feat_ext: FeatureExtractor):
        """
        :param feat_ext: zero-argument callable (in practice a
            FeatureExtractor subclass) that builds the shared extractor
        :raises NotImplementedError: if the built extractor has no features() method
        """
        super().__init__()
        extractor = feat_ext()
        self.feat_ext = extractor
        if not hasattr(extractor, 'features'):
            raise NotImplementedError('The provided feature extractor needs to provide a features() method')

    def features(self, x):
        """Return the shared extractor's features for `x`."""
        return self.feat_ext.features(x)

    def forward(self, x1, x2, x3):
        """Return the features of the three inputs as a 3-tuple, in order."""
        f1 = self.features(x1)
        f2 = self.features(x2)
        f3 = self.features(x3)
        return f1, f2, f3
35
+
36
+
37
class EfficientNetB4(TripletNet):
    """Triplet network built on fornet.EfficientNetB4."""

    def __init__(self):
        super().__init__(feat_ext=fornet.EfficientNetB4)
40
+
41
+
42
class EfficientNetAutoAttB4(TripletNet):
    """Triplet network built on fornet.EfficientNetAutoAttB4."""

    def __init__(self):
        super().__init__(feat_ext=fornet.EfficientNetAutoAttB4)
models/icpr2020dfdc/architectures/weights.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Video Face Manipulation Detection Through Ensemble of CNNs
3
+
4
+ Image and Sound Processing Lab - Politecnico di Milano
5
+
6
+ Nicolò Bonettini
7
+ Edoardo Daniele Cannas
8
+ Sara Mandelli
9
+ Luca Bondi
10
+ Paolo Bestagini
11
+ """
12
+
13
# Download URL of the best-validation checkpoint for each
# '<architecture>_<dataset>' combination.
weight_url = {
    'EfficientNetAutoAttB4ST_DFDC': 'https://f002.backblazeb2.com/file/icpr2020/EfficientNetAutoAttB4ST_DFDC_bestval-4df0ef7d2f380a5955affa78c35d0942ac1cd65229510353b252737775515a33.pth',
    'EfficientNetAutoAttB4ST_FFPP': 'https://f002.backblazeb2.com/file/icpr2020/EfficientNetAutoAttB4ST_FFPP_bestval-ddb357503b9b902e1b925c2550415604c4252b9b9ecafeb7369dc58cc16e9edd.pth',
    'EfficientNetAutoAttB4_DFDC': 'https://f002.backblazeb2.com/file/icpr2020/EfficientNetAutoAttB4_DFDC_bestval-72ed969b2a395fffe11a0d5bf0a635e7260ba2588c28683630d97ff7153389fc.pth',
    'EfficientNetAutoAttB4_FFPP': 'https://f002.backblazeb2.com/file/icpr2020/EfficientNetAutoAttB4_FFPP_bestval-b0c9e9522a7143cf119843e910234be5e30f77dc527b1b427cdffa5ce3bdbc25.pth',
    'EfficientNetB4ST_DFDC': 'https://f002.backblazeb2.com/file/icpr2020/EfficientNetB4ST_DFDC_bestval-86f0a0701b18694dfb5e7837bd09fa8e48a5146c193227edccf59f1b038181c6.pth',
    'EfficientNetB4ST_FFPP': 'https://f002.backblazeb2.com/file/icpr2020/EfficientNetB4ST_FFPP_bestval-ccd016668071be5bf5fff68e446d055441739ec7113fb1a6eee998f08396ae92.pth',
    'EfficientNetB4_DFDC': 'https://f002.backblazeb2.com/file/icpr2020/EfficientNetB4_DFDC_bestval-c9f3663e2116d3356d056a0ce6453e0fc412a8df68ebd0902f07104d9129a09a.pth',
    'EfficientNetB4_FFPP': 'https://f002.backblazeb2.com/file/icpr2020/EfficientNetB4_FFPP_bestval-93aaad84946829e793d1a67ed7e0309b535e2f2395acb4f8d16b92c0616ba8d7.pth',
    'Xception_DFDC': 'https://f002.backblazeb2.com/file/icpr2020/Xception_DFDC_bestval-e826cdb64d73ef491e6b8ff8fce0e1e1b7fc1d8e2715bc51a56280fff17596f9.pth',
    'Xception_FFPP': 'https://f002.backblazeb2.com/file/icpr2020/Xception_FFPP_bestval-bb119e4913cb8f816cd28a03f81f4c603d6351bf8e3f8e3eb99eebc923aecd22.pth',
}
models/icpr2020dfdc/blazeface/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from .blazeface import BlazeFace
2
+ from .face_extract import FaceExtractor
3
+ from .read_video import VideoReader
models/icpr2020dfdc/blazeface/anchors.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a10bb2fb93ab54ca426d6c750bfc3aad685028a16dcf231357d03694f261fd95
3
+ size 28800
models/icpr2020dfdc/blazeface/blazeface.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:54ecff653feaaaf1f7d44b6aff28fd2fc50e483a4e847563b6dd261369c43ba4
3
+ size 420224
models/icpr2020dfdc/blazeface/blazeface.py ADDED
@@ -0,0 +1,417 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List
2
+
3
+ import numpy as np
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+
8
+
9
class BlazeBlock(nn.Module):
    """Residual block: depthwise conv + 1x1 conv, added to a skip path.

    With stride 2 the skip path is max-pooled; when the block widens the
    channel count, the skip path is zero-padded along the channel axis.
    """

    def __init__(self, in_channels, out_channels, kernel_size=3, stride=1):
        super().__init__()

        self.stride = stride
        # Number of zero channels appended to the skip path in forward().
        self.channel_pad = out_channels - in_channels

        # TFLite uses slightly different padding than PyTorch on the
        # depthwise conv layer when the stride is 2, so in that case the conv
        # runs unpadded and forward() pads its input by hand.
        downsample = stride == 2
        if downsample:
            self.max_pool = nn.MaxPool2d(kernel_size=stride, stride=stride)
        padding = 0 if downsample else (kernel_size - 1) // 2

        depthwise = nn.Conv2d(in_channels=in_channels, out_channels=in_channels,
                              kernel_size=kernel_size, stride=stride, padding=padding,
                              groups=in_channels, bias=True)
        pointwise = nn.Conv2d(in_channels=in_channels, out_channels=out_channels,
                              kernel_size=1, stride=1, padding=0, bias=True)
        self.convs = nn.Sequential(depthwise, pointwise)

        self.act = nn.ReLU(inplace=True)

    def forward(self, x):
        """Return act(convs(padded input) + skip path)."""
        if self.stride == 2:
            skip = self.max_pool(x)
            # Pad right and bottom by 2 before the unpadded stride-2 conv.
            conv_in = F.pad(x, (0, 2, 0, 2), "constant", 0)
        else:
            skip = x
            conv_in = x

        if self.channel_pad > 0:
            skip = F.pad(skip, (0, 0, 0, 0, 0, self.channel_pad), "constant", 0)

        return self.act(self.convs(conv_in) + skip)
45
+
46
+
47
class BlazeFace(nn.Module):
    """The BlazeFace face detection model from MediaPipe.

    The version from MediaPipe is simpler than the one in the paper;
    it does not use the "double" BlazeBlocks.

    Because we won't be training this model, it doesn't need to have
    batchnorm layers. These have already been "folded" into the conv
    weights by TFLite.

    The conversion to PyTorch is fairly straightforward, but there are
    some small differences between TFLite and PyTorch in how they handle
    padding on conv layers with stride 2.

    This version works on batches, while the MediaPipe version can only
    handle a single image at a time.

    Typical use: construct, call load_weights() and load_anchors(), then
    predict_on_image() / predict_on_batch().

    Based on code from https://github.com/tkat0/PyTorch_BlazeFace/ and
    https://github.com/google/mediapipe/
    """
    # Height and width the detector expects for every input image.
    input_size = (128, 128)

    # Names of the 17 values of one detection, in tensor order.
    detection_keys = [
        'ymin', 'xmin', 'ymax', 'xmax',
        'kp1x', 'kp1y', 'kp2x', 'kp2y', 'kp3x', 'kp3y', 'kp4x', 'kp4y', 'kp5x', 'kp5y', 'kp6x', 'kp6y',
        'conf'
    ]

    def __init__(self):
        super(BlazeFace, self).__init__()

        # These are the settings from the MediaPipe example graph
        # mediapipe/graphs/face_detection/face_detection_mobile_gpu.pbtxt
        self.num_classes = 1
        self.num_anchors = 896
        self.num_coords = 16
        self.score_clipping_thresh = 100.0
        self.x_scale = 128.0
        self.y_scale = 128.0
        self.h_scale = 128.0
        self.w_scale = 128.0
        self.min_score_thresh = 0.75
        self.min_suppression_threshold = 0.3

        self._define_layers()

    def _define_layers(self):
        """Create the two backbone stages and the per-scale output heads."""
        self.backbone1 = nn.Sequential(
            nn.Conv2d(in_channels=3, out_channels=24, kernel_size=5, stride=2, padding=0, bias=True),
            nn.ReLU(inplace=True),

            BlazeBlock(24, 24),
            BlazeBlock(24, 28),
            BlazeBlock(28, 32, stride=2),
            BlazeBlock(32, 36),
            BlazeBlock(36, 42),
            BlazeBlock(42, 48, stride=2),
            BlazeBlock(48, 56),
            BlazeBlock(56, 64),
            BlazeBlock(64, 72),
            BlazeBlock(72, 80),
            BlazeBlock(80, 88),
        )

        self.backbone2 = nn.Sequential(
            BlazeBlock(88, 96, stride=2),
            BlazeBlock(96, 96),
            BlazeBlock(96, 96),
            BlazeBlock(96, 96),
            BlazeBlock(96, 96),
        )

        self.classifier_8 = nn.Conv2d(88, 2, 1, bias=True)
        self.classifier_16 = nn.Conv2d(96, 6, 1, bias=True)

        self.regressor_8 = nn.Conv2d(88, 32, 1, bias=True)
        self.regressor_16 = nn.Conv2d(96, 96, 1, bias=True)

    def forward(self, x):
        """Run the network on preprocessed images.

        Returns a two-element list [r, c]: the raw box regressions of shape
        (b, 896, 16) and the raw classification scores of shape (b, 896, 1).
        """
        # TFLite uses slightly different padding on the first conv layer
        # than PyTorch, so do it manually.
        x = F.pad(x, (1, 2, 1, 2), "constant", 0)

        b = x.shape[0]  # batch size, needed for reshaping later

        x = self.backbone1(x)  # (b, 88, 16, 16)
        h = self.backbone2(x)  # (b, 96, 8, 8)

        # Note: Because PyTorch is NCHW but TFLite is NHWC, we need to
        # permute the output from the conv layers before reshaping it.

        c1 = self.classifier_8(x)  # (b, 2, 16, 16)
        c1 = c1.permute(0, 2, 3, 1)  # (b, 16, 16, 2)
        c1 = c1.reshape(b, -1, 1)  # (b, 512, 1)

        c2 = self.classifier_16(h)  # (b, 6, 8, 8)
        c2 = c2.permute(0, 2, 3, 1)  # (b, 8, 8, 6)
        c2 = c2.reshape(b, -1, 1)  # (b, 384, 1)

        c = torch.cat((c1, c2), dim=1)  # (b, 896, 1)

        r1 = self.regressor_8(x)  # (b, 32, 16, 16)
        r1 = r1.permute(0, 2, 3, 1)  # (b, 16, 16, 32)
        r1 = r1.reshape(b, -1, 16)  # (b, 512, 16)

        r2 = self.regressor_16(h)  # (b, 96, 8, 8)
        r2 = r2.permute(0, 2, 3, 1)  # (b, 8, 8, 96)
        r2 = r2.reshape(b, -1, 16)  # (b, 384, 16)

        r = torch.cat((r1, r2), dim=1)  # (b, 896, 16)
        return [r, c]

    def _device(self):
        """Which device (CPU or GPU) is being used by this model?"""
        return self.classifier_8.weight.device

    def load_weights(self, path):
        """Load the state dict stored at `path` and switch to eval mode."""
        # map_location lets a checkpoint saved on another device (e.g. a GPU)
        # load on whatever device this model currently lives on.
        self.load_state_dict(torch.load(path, map_location=self._device()))
        self.eval()

    def load_anchors(self, path):
        """Load the (num_anchors, 4) anchor array stored at `path` (NumPy file)."""
        self.anchors = torch.tensor(np.load(path), dtype=torch.float32, device=self._device())
        assert (self.anchors.ndimension() == 2)
        assert (self.anchors.shape[0] == self.num_anchors)
        assert (self.anchors.shape[1] == 4)

    def _preprocess(self, x):
        """Converts the image pixels to the range [-1, 1]."""
        return x.float() / 127.5 - 1.0

    def predict_on_image(self, img):
        """Makes a prediction on a single image.

        Arguments:
            img: a NumPy array of shape (H, W, 3) or a PyTorch tensor of
                 shape (3, H, W). The image's height and width should be
                 128 pixels.

        Returns:
            A tensor with face detections.
        """
        if isinstance(img, np.ndarray):
            img = torch.from_numpy(img).permute((2, 0, 1))

        return self.predict_on_batch(img.unsqueeze(0))[0]

    def predict_on_batch(self, x: "np.ndarray | torch.Tensor", apply_nms: bool = True) -> List[torch.Tensor]:
        """Makes a prediction on a batch of images.

        load_anchors() must have been called first.

        Arguments:
            x: a NumPy array of shape (b, H, W, 3) or a PyTorch tensor of
               shape (b, 3, H, W). The height and width should be 128 pixels.
            apply_nms: pass False to not apply non-max suppression

        Returns:
            A list containing a tensor of face detections for each image in
            the batch. If no faces are found for an image, returns a tensor
            of shape (0, 17).

        Each face detection is a PyTorch tensor consisting of 17 numbers:
            - ymin, xmin, ymax, xmax
            - x,y-coordinates for the 6 keypoints
            - confidence score
        """
        if isinstance(x, np.ndarray):
            x = torch.from_numpy(x).permute((0, 3, 1, 2))

        assert x.shape[1] == 3
        assert x.shape[2] == 128
        assert x.shape[3] == 128

        # 1. Preprocess the images into tensors:
        x = x.to(self._device())
        x = self._preprocess(x)

        # 2. Run the neural network (forward returns [boxes, scores]):
        with torch.no_grad():
            raw_boxes, raw_scores = self(x)

        # 3. Postprocess the raw predictions:
        detections = self._tensors_to_detections(raw_boxes, raw_scores, self.anchors)

        # 4. Non-maximum suppression to remove overlapping detections:
        return self.nms(detections) if apply_nms else detections

    def nms(self, detections: List[torch.Tensor]) -> List[torch.Tensor]:
        """Filters out overlapping detections."""
        filtered_detections = []
        for image_detections in detections:
            faces = self._weighted_non_max_suppression(image_detections)
            # Images without faces still get a well-shaped empty tensor.
            faces = torch.stack(faces) if len(faces) > 0 else torch.zeros((0, 17), device=self._device())
            filtered_detections.append(faces)

        return filtered_detections

    def _tensors_to_detections(self, raw_box_tensor: torch.Tensor, raw_score_tensor: torch.Tensor, anchors) -> List[
        torch.Tensor]:
        """The output of the neural network is a tensor of shape (b, 896, 16)
        containing the bounding box regressor predictions, as well as a tensor
        of shape (b, 896, 1) with the classification confidences.

        This function converts these two "raw" tensors into proper detections.
        Returns a list of (num_detections, 17) tensors, one for each image in
        the batch.

        This is based on the source code from:
        mediapipe/calculators/tflite/tflite_tensors_to_detections_calculator.cc
        mediapipe/calculators/tflite/tflite_tensors_to_detections_calculator.proto
        """
        assert raw_box_tensor.ndimension() == 3
        assert raw_box_tensor.shape[1] == self.num_anchors
        assert raw_box_tensor.shape[2] == self.num_coords

        assert raw_score_tensor.ndimension() == 3
        assert raw_score_tensor.shape[1] == self.num_anchors
        assert raw_score_tensor.shape[2] == self.num_classes

        assert raw_box_tensor.shape[0] == raw_score_tensor.shape[0]

        detection_boxes = self._decode_boxes(raw_box_tensor, anchors)

        # Clamp the logits before the sigmoid, as the MediaPipe settings do.
        thresh = self.score_clipping_thresh
        raw_score_tensor = raw_score_tensor.clamp(-thresh, thresh)
        detection_scores = raw_score_tensor.sigmoid().squeeze(dim=-1)

        # Note: we stripped off the last dimension from the scores tensor
        # because there is only one class. Now we can simply use a mask
        # to filter out the boxes with too low confidence.
        mask = detection_scores >= self.min_score_thresh

        # Because each image from the batch can have a different number of
        # detections, process them one at a time using a loop.
        output_detections = []
        for i in range(raw_box_tensor.shape[0]):
            boxes = detection_boxes[i, mask[i]]
            scores = detection_scores[i, mask[i]].unsqueeze(dim=-1)
            output_detections.append(torch.cat((boxes, scores), dim=-1))

        return output_detections

    def _decode_boxes(self, raw_boxes, anchors):
        """Converts the predictions into actual coordinates using
        the anchor boxes. Processes the entire batch at once.

        Anchor columns 0/1 are used as x/y offsets and columns 2/3 as
        x/y scale factors.
        """
        boxes = torch.zeros_like(raw_boxes)

        x_center = raw_boxes[..., 0] / self.x_scale * anchors[:, 2] + anchors[:, 0]
        y_center = raw_boxes[..., 1] / self.y_scale * anchors[:, 3] + anchors[:, 1]

        w = raw_boxes[..., 2] / self.w_scale * anchors[:, 2]
        h = raw_boxes[..., 3] / self.h_scale * anchors[:, 3]

        boxes[..., 0] = y_center - h / 2.  # ymin
        boxes[..., 1] = x_center - w / 2.  # xmin
        boxes[..., 2] = y_center + h / 2.  # ymax
        boxes[..., 3] = x_center + w / 2.  # xmax

        # The six keypoints follow the box as interleaved (x, y) pairs.
        for k in range(6):
            offset = 4 + k * 2
            keypoint_x = raw_boxes[..., offset] / self.x_scale * anchors[:, 2] + anchors[:, 0]
            keypoint_y = raw_boxes[..., offset + 1] / self.y_scale * anchors[:, 3] + anchors[:, 1]
            boxes[..., offset] = keypoint_x
            boxes[..., offset + 1] = keypoint_y

        return boxes

    def _weighted_non_max_suppression(self, detections):
        """The alternative NMS method as mentioned in the BlazeFace paper:

        "We replace the suppression algorithm with a blending strategy that
        estimates the regression parameters of a bounding box as a weighted
        mean between the overlapping predictions."

        The original MediaPipe code assigns the score of the most confident
        detection to the weighted detection, but we take the average score
        of the overlapping detections.

        The input detections should be a Tensor of shape (count, 17).

        Returns a list of PyTorch tensors, one for each detected face.

        This is based on the source code from:
        mediapipe/calculators/util/non_max_suppression_calculator.cc
        mediapipe/calculators/util/non_max_suppression_calculator.proto
        """
        if len(detections) == 0:
            return []

        output_detections = []

        # Sort the detections from highest to lowest score.
        remaining = torch.argsort(detections[:, 16], descending=True)

        while len(remaining) > 0:
            detection = detections[remaining[0]]

            # Compute the overlap between the first box and the other
            # remaining boxes. (Note that the other_boxes also include
            # the first_box.)
            first_box = detection[:4]
            other_boxes = detections[remaining, :4]
            ious = overlap_similarity(first_box, other_boxes)

            # If two detections don't overlap enough, they are considered
            # to be from different faces.
            mask = ious > self.min_suppression_threshold
            # The first box always belongs to its own group. Without this, a
            # box with non-positive width or height (IoU with itself is 0 or
            # NaN) would never leave `remaining` and the loop would not end.
            mask[0] = True
            overlapping = remaining[mask]
            remaining = remaining[~mask]

            # Take an average of the coordinates from the overlapping
            # detections, weighted by their confidence scores.
            weighted_detection = detection.clone()
            if len(overlapping) > 1:
                coordinates = detections[overlapping, :16]
                scores = detections[overlapping, 16:17]
                total_score = scores.sum()
                weighted = (coordinates * scores).sum(dim=0) / total_score
                weighted_detection[:16] = weighted
                weighted_detection[16] = total_score / len(overlapping)

            output_detections.append(weighted_detection)

        return output_detections
369
+
370
+ # IOU code from https://github.com/amdegroot/ssd.pytorch/blob/master/layers/box_utils.py
371
+
372
+
373
def intersect(box_a, box_b):
    """ Pairwise intersection area between two sets of boxes.
    Both tensors are broadcast to [A,B,2] without copying:
    [A,2] -> [A,1,2] and [B,2] -> [1,B,2].
    Args:
      box_a: (tensor) bounding boxes, Shape: [A,4].
      box_b: (tensor) bounding boxes, Shape: [B,4].
    Return:
      (tensor) intersection area, Shape: [A,B].
    """
    # Far corner of the overlap: the smaller of the two max corners.
    upper = torch.min(box_a[:, None, 2:], box_b[None, :, 2:])
    # Near corner of the overlap: the larger of the two min corners.
    lower = torch.max(box_a[:, None, :2], box_b[None, :, :2])
    # Disjoint boxes give a negative extent; clamp it to zero area.
    extent = torch.clamp(upper - lower, min=0)
    return extent[..., 0] * extent[..., 1]
392
+
393
+
394
def jaccard(box_a, box_b):
    """Compute the jaccard overlap (intersection over union) for every
    pair of boxes drawn from the two sets.
    E.g.:
        A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B)
    Args:
        box_a: (tensor) bounding boxes, Shape: [A,4]
        box_b: (tensor) bounding boxes, Shape: [B,4]
    Return:
        jaccard overlap: (tensor) Shape: [box_a.size(0), box_b.size(0)]
    """
    inter = intersect(box_a, box_b)
    # Per-box areas, laid out as a column and a row so they broadcast to [A,B].
    area_a = ((box_a[:, 2] - box_a[:, 0]) * (box_a[:, 3] - box_a[:, 1]))[:, None]
    area_b = ((box_b[:, 2] - box_b[:, 0]) * (box_b[:, 3] - box_b[:, 1]))[None, :]
    union = area_a + area_b - inter
    return inter / union  # [A,B]
413
+
414
+
415
def overlap_similarity(box, other_boxes):
    """Computes the IOU between a bounding box and set of other boxes."""
    # Treat the single box as a one-row set, then drop that row axis again.
    pairwise = jaccard(box.unsqueeze(0), other_boxes)
    return pairwise.squeeze(0)
models/icpr2020dfdc/blazeface/face_extract.py ADDED
@@ -0,0 +1,470 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import Tuple, List
3
+
4
+ import cv2
5
+ import numpy as np
6
+ import torch
7
+ from PIL import Image
8
+
9
+ from blazeface import BlazeFace
10
+
11
+
12
+ class FaceExtractor:
13
+ """Wrapper for face extraction workflow."""
14
+
15
def __init__(self, video_read_fn = None, facedet: BlazeFace = None):
    """Creates a new FaceExtractor.

    Arguments:
        video_read_fn: a function that takes in a path to a video file
            and returns a tuple consisting of a NumPy array with shape
            (num_frames, H, W, 3) and a list of frame indices, or None
            in case of an error
        facedet: the face detector object
    """
    # Both collaborators are stored as given; nothing is validated here.
    self.video_read_fn, self.facedet = video_read_fn, facedet
27
+
28
def process_image(self, path: str = None, img: "Image.Image | np.ndarray" = None) -> dict:
    """
    Process a single image
    :param path: Path to the image
    :param img: image (PIL image or NumPy array); exactly one of `path`
        and `img` must be given
    :return: dict with keys "frame_w", "frame_h", "faces", "kpts",
        "detections" and "scores", with the per-face entries sorted by
        descending score
    :raises ValueError: if both or neither of `path` and `img` are given
    """

    if img is not None and path is not None:
        raise ValueError('Only one argument between path and img can be specified')
    if img is None and path is None:
        raise ValueError('At least one argument between path and img must be specified')

    target_size = self.facedet.input_size

    if img is None:
        # Read the pixels while the file is open, then release the handle
        # right away instead of leaving it to the garbage collector.
        with Image.open(str(path)) as pil_img:
            img = np.asarray(pil_img)
    else:
        img = np.asarray(img)

    # Split the frames into several tiles. Resize the tiles to 128x128.
    tiles, resize_info = self._tile_frames(np.expand_dims(img, 0), target_size)
    # tiles has shape (num_tiles, target_size, target_size, 3)
    # resize_info is a list of four elements [resize_factor_y, resize_factor_x, 0, 0]

    # Run the face detector. The result is a list of PyTorch tensors,
    # one for each tile in the batch.
    detections = self.facedet.predict_on_batch(tiles, apply_nms=False)

    # Convert the detections from 128x128 back to the original frame size.
    detections = self._resize_detections(detections, target_size, resize_info)

    # Because we have several tiles for each frame, combine the predictions
    # from these tiles. The result is a list of PyTorch tensors, but now one
    # for each frame (rather than each tile).
    num_frames = 1
    frame_size = (img.shape[1], img.shape[0])  # (width, height)
    detections = self._untile_detections(num_frames, frame_size, detections)

    # The same face may have been detected in multiple tiles, so filter out
    # overlapping detections. This is done separately for each frame.
    detections = self.facedet.nms(detections)

    # Crop the faces out of the original frame.
    frameref_detections = self._add_margin_to_detections(detections[0], frame_size, 0.2)
    faces = self._crop_faces(img, frameref_detections)
    kpts = self._crop_kpts(img, detections[0], 0.3)

    # Add additional information about the frame and detections.
    # Column 16 of each detection is its confidence score.
    scores = list(detections[0][:, 16].cpu().numpy())
    frame_dict = {"frame_w": frame_size[0],
                  "frame_h": frame_size[1],
                  "faces": faces,
                  "kpts": kpts,
                  "detections": frameref_detections.cpu().numpy(),
                  "scores": scores,
                  }

    # Sort faces by descending confidence
    frame_dict = self._soft_faces_by_descending_score(frame_dict)

    return frame_dict
90
+
91
+ def _soft_faces_by_descending_score(self, frame_dict: dict) -> dict:
92
+ if len(frame_dict['scores']) > 1:
93
+ sort_idxs = np.argsort(frame_dict['scores'])[::-1]
94
+ new_faces = [frame_dict['faces'][i] for i in sort_idxs]
95
+ new_kpts = [frame_dict['kpts'][i] for i in sort_idxs]
96
+ new_detections = frame_dict['detections'][sort_idxs]
97
+ new_scores = [frame_dict['scores'][i] for i in sort_idxs]
98
+ frame_dict['faces'] = new_faces
99
+ frame_dict['kpts'] = new_kpts
100
+ frame_dict['detections'] = new_detections
101
+ frame_dict['scores'] = new_scores
102
+ return frame_dict
103
+
104
def process_videos(self, input_dir, filenames, video_idxs) -> List[dict]:
    """Run face detection on frames read from a selection of videos.

    Frames of every selected video are split into tiles and the tiles of
    all videos are concatenated into a single batch for the detector.

    Arguments:
        input_dir: base folder where the video files are stored
        filenames: list of all video files in the input_dir
        video_idxs: one or more indices into ``filenames``; these are the
            videos that are actually processed

    Returns a list of dictionaries, one for each frame read from each video,
    with the keys:
        - video_idx: the video this frame was taken from
        - frame_idx: the index of the frame in the video
        - frame_w, frame_h: original dimensions of the frame
        - frame: the frame itself
        - faces: a list containing zero or more NumPy arrays with a face crop
        - kpts: for each face, a list of keypoint crops
        - detections: the margin-expanded detections as a NumPy array
        - scores: a list with the confidence score for each face crop

    Videos for which ``video_read_fn`` returns None are skipped and do not
    appear in the output.
    """
    target_size = self.facedet.input_size

    # One tuple per successfully read video:
    # (video_idx, frames, frame indices, tiles, resize info)
    loaded = []
    for video_idx in video_idxs:
        video_path = os.path.join(input_dir, filenames[video_idx])
        read_result = self.video_read_fn(video_path)

        # Error? Then skip this video.
        if read_result is None:
            continue

        video_frames, video_frame_idxs = read_result
        video_tiles, video_resize_info = self._tile_frames(video_frames, target_size)
        loaded.append((video_idx, video_frames, video_frame_idxs, video_tiles, video_resize_info))

    if len(loaded) == 0:
        return []

    # Put all the tiles of all the frames of all the videos in one batch.
    batch = np.concatenate([entry[3] for entry in loaded])

    # One detection entry per tile in the batch.
    all_detections = self.facedet.predict_on_batch(batch, apply_nms=False)

    results = []
    cursor = 0
    for video_idx, video_frames, video_frame_idxs, video_tiles, video_resize_info in loaded:
        # Videos may contribute different numbers of tiles, so slice out
        # the detections that belong to this one.
        num_tiles = video_tiles.shape[0]
        detections = all_detections[cursor:cursor + num_tiles]
        cursor += num_tiles

        # Back to tile pixel coordinates.
        detections = self._resize_detections(detections, target_size, video_resize_info)

        # Group the per-tile detections per frame.
        num_frames = video_frames.shape[0]
        frame_size = (video_frames.shape[2], video_frames.shape[1])
        detections = self._untile_detections(num_frames, frame_size, detections)

        # Overlapping tiles may report the same face twice.
        detections = self.facedet.nms(detections)

        for i in range(len(detections)):
            frame = video_frames[i]
            frame_detections = detections[i]

            # Crop the faces out of the original frame.
            frameref_detections = self._add_margin_to_detections(frame_detections, frame_size, 0.2)
            faces = self._crop_faces(frame, frameref_detections)
            kpts = self._crop_kpts(frame, frame_detections, 0.3)

            # Add additional information about the frame and detections.
            scores = list(frame_detections[:, 16].cpu().numpy())
            frame_dict = {"video_idx": video_idx,
                          "frame_idx": video_frame_idxs[i],
                          "frame_w": frame_size[0],
                          "frame_h": frame_size[1],
                          "frame": frame,
                          "faces": faces,
                          "kpts": kpts,
                          "detections": frameref_detections.cpu().numpy(),
                          "scores": scores,
                          }
            # Sort faces by descending confidence
            results.append(self._soft_faces_by_descending_score(frame_dict))

    return results
219
+
220
def process_video(self, video_path):
    """Convenience wrapper: run ``process_videos`` on one video file."""
    folder, filename = os.path.dirname(video_path), os.path.basename(video_path)
    return self.process_videos(folder, [filename], [0])
225
+
226
def _tile_frames(self, frames: np.ndarray, target_size: Tuple[int, int]) -> (np.ndarray, List[float]):
    """Cut each frame into square, partially overlapping tiles and resize them.

    The tile layout (count, size and stride) comes from
    ``get_tiles_params``; tiles are emitted frame by frame, row by row,
    left to right.

    Arguments:
        frames: NumPy array of shape (num_frames, height, width, 3)
        target_size: (width, height) each tile is resized to

    Returns:
        - a uint8 array of shape (num_frames * N, target_size[1],
          target_size[0], 3), where N is the number of tiles per frame
        - a list [scale_w, scale_h, offset_x, offset_y] used to map
          coordinates in a resized tile back to tile pixels (the offsets
          are always 0 here)
    """
    num_frames, H, W, _ = frames.shape
    num_h, num_v, split_size, x_step, y_step = self.get_tiles_params(H, W)

    tiles = np.zeros((num_frames * num_v * num_h, target_size[1], target_size[0], 3), dtype=np.uint8)

    # Top-left corner (y, x) of every tile, in emission order.
    origins = [(row * y_step, col * x_step) for row in range(num_v) for col in range(num_h)]

    out_idx = 0
    for f in range(num_frames):
        for top, left in origins:
            window = frames[f, top:top + split_size, left:left + split_size, :]
            tiles[out_idx] = cv2.resize(window, target_size, interpolation=cv2.INTER_AREA)
            out_idx += 1

    resize_info = [split_size / target_size[0], split_size / target_size[1], 0, 0]
    return tiles, resize_info
278
+
279
def get_tiles_params(self, H, W):
    """Compute the tile layout for a frame of height ``H`` and width ``W``.

    Tiles are squares whose side is the smaller frame dimension, capped at
    720 pixels; the stride along each axis is half of the leftover space.

    Returns ``(num_h, num_v, split_size, x_step, y_step)``: the number of
    tiles per row and per column, the tile side, and the horizontal and
    vertical strides.
    """
    split_size = min(H, W, 720)
    spare_w = W - split_size
    spare_h = H - split_size
    x_step = spare_w // 2
    y_step = spare_h // 2

    # A zero stride means the tile already spans that axis: a single tile.
    num_v = 1
    if y_step > 0:
        num_v = spare_h // y_step + 1
    num_h = 1
    if x_step > 0:
        num_h = spare_w // x_step + 1

    return num_h, num_v, split_size, x_step, y_step
286
+
287
+ def _resize_detections(self, detections, target_size, resize_info):
288
+ """Converts a list of face detections back to the original
289
+ coordinate system.
290
+
291
+ Arguments:
292
+ detections: a list containing PyTorch tensors of shape (num_faces, 17)
293
+ target_size: (width, height)
294
+ resize_info: [scale_w, scale_h, offset_x, offset_y]
295
+ """
296
+ projected = []
297
+ target_w, target_h = target_size
298
+ scale_w, scale_h, offset_x, offset_y = resize_info
299
+
300
+ for i in range(len(detections)):
301
+ detection = detections[i].clone()
302
+
303
+ # ymin, xmin, ymax, xmax
304
+ for k in range(2):
305
+ detection[:, k * 2] = (detection[:, k * 2] * target_h - offset_y) * scale_h
306
+ detection[:, k * 2 + 1] = (detection[:, k * 2 + 1] * target_w - offset_x) * scale_w
307
+
308
+ # keypoints are x,y
309
+ for k in range(2, 8):
310
+ detection[:, k * 2] = (detection[:, k * 2] * target_w - offset_x) * scale_w
311
+ detection[:, k * 2 + 1] = (detection[:, k * 2 + 1] * target_h - offset_y) * scale_h
312
+
313
+ projected.append(detection)
314
+
315
+ return projected
316
+
317
+ def _untile_detections(self, num_frames: int, frame_size: Tuple[int, int], detections: List[torch.Tensor]) -> List[
318
+ torch.Tensor]:
319
+ """With N tiles per frame, there also are N times as many detections.
320
+ This function groups together the detections for a given frame; it is
321
+ the complement to tile_frames().
322
+ """
323
+ combined_detections = []
324
+
325
+ W, H = frame_size
326
+
327
+ num_h, num_v, split_size, x_step, y_step = self.get_tiles_params(H, W)
328
+
329
+ i = 0
330
+ for f in range(num_frames):
331
+ detections_for_frame = []
332
+ y = 0
333
+ for v in range(num_v):
334
+ x = 0
335
+ for h in range(num_h):
336
+ # Adjust the coordinates based on the split positions.
337
+ detection = detections[i].clone()
338
+ if detection.shape[0] > 0:
339
+ for k in range(2):
340
+ detection[:, k * 2] += y
341
+ detection[:, k * 2 + 1] += x
342
+ for k in range(2, 8):
343
+ detection[:, k * 2] += x
344
+ detection[:, k * 2 + 1] += y
345
+
346
+ detections_for_frame.append(detection)
347
+ x += x_step
348
+ i += 1
349
+ y += y_step
350
+
351
+ combined_detections.append(torch.cat(detections_for_frame))
352
+
353
+ return combined_detections
354
+
355
+ def _add_margin_to_detections(self, detections: torch.Tensor, frame_size: Tuple[int, int],
356
+ margin: float = 0.2) -> torch.Tensor:
357
+ """Expands the face bounding box.
358
+
359
+ NOTE: The face detections often do not include the forehead, which
360
+ is why we use twice the margin for ymin.
361
+
362
+ Arguments:
363
+ detections: a PyTorch tensor of shape (num_detections, 17)
364
+ frame_size: maximum (width, height)
365
+ margin: a percentage of the bounding box's height
366
+
367
+ Returns a PyTorch tensor of shape (num_detections, 17).
368
+ """
369
+ offset = torch.round(margin * (detections[:, 2] - detections[:, 0]))
370
+ detections = detections.clone()
371
+ detections[:, 0] = torch.clamp(detections[:, 0] - offset * 2, min=0) # ymin
372
+ detections[:, 1] = torch.clamp(detections[:, 1] - offset, min=0) # xmin
373
+ detections[:, 2] = torch.clamp(detections[:, 2] + offset, max=frame_size[1]) # ymax
374
+ detections[:, 3] = torch.clamp(detections[:, 3] + offset, max=frame_size[0]) # xmax
375
+ return detections
376
+
377
+ def _crop_faces(self, frame: np.ndarray, detections: torch.Tensor) -> List[np.ndarray]:
378
+ """Copies the face region(s) from the given frame into a set
379
+ of new NumPy arrays.
380
+
381
+ Arguments:
382
+ frame: a NumPy array of shape (H, W, 3)
383
+ detections: a PyTorch tensor of shape (num_detections, 17)
384
+
385
+ Returns a list of NumPy arrays, one for each face crop. If there
386
+ are no faces detected for this frame, returns an empty list.
387
+ """
388
+ faces = []
389
+ for i in range(len(detections)):
390
+ ymin, xmin, ymax, xmax = detections[i, :4].cpu().numpy().astype(int)
391
+ face = frame[ymin:ymax, xmin:xmax, :]
392
+ faces.append(face)
393
+ return faces
394
+
395
+ def _crop_kpts(self, frame: np.ndarray, detections: torch.Tensor, face_fraction: float):
396
+ """Copies the parts region(s) from the given frame into a set
397
+ of new NumPy arrays.
398
+
399
+ Arguments:
400
+ frame: a NumPy array of shape (H, W, 3)
401
+ detections: a PyTorch tensor of shape (num_detections, 17)
402
+ face_fraction: float between 0 and 1 indicating how big are the parts to be extracted w.r.t the whole face
403
+
404
+ Returns a list of NumPy arrays, one for each face crop. If there
405
+ are no faces detected for this frame, returns an empty list.
406
+ """
407
+ faces = []
408
+ for i in range(len(detections)):
409
+ kpts = []
410
+ size = int(face_fraction * min(detections[i, 2] - detections[i, 0], detections[i, 3] - detections[i, 1]))
411
+ kpts_coords = detections[i, 4:16].cpu().numpy().astype(int)
412
+ for kpidx in range(6):
413
+ kpx, kpy = kpts_coords[kpidx * 2:kpidx * 2 + 2]
414
+ kpt = frame[kpy - size // 2:kpy - size // 2 + size, kpx - size // 2:kpx - size // 2 + size, ]
415
+ kpts.append(kpt)
416
+ faces.append(kpts)
417
+ return faces
418
+
419
def remove_large_crops(self, crops, pct=0.1):
    """Drop faces that cover too large a share of the frame.

    A face taking up ``pct`` or more of the frame area is likely a false
    positive. This is an optional postprocessing step that modifies the
    dictionaries in ``crops`` in place; only their ``faces`` and ``scores``
    entries are filtered.

    Arguments:
        crops: a list of dictionaries with face crop data (needs the keys
            ``frame_w``, ``frame_h``, ``faces`` and ``scores``)
        pct: maximum portion of the frame a crop may take up
    """
    for frame_data in crops:
        video_area = frame_data["frame_w"] * frame_data["frame_h"]
        new_faces = []
        new_scores = []
        for face, score in zip(frame_data["faces"], frame_data["scores"]):
            face_H, face_W, _ = face.shape
            # Honour the caller-supplied threshold (it used to be a fixed 0.1).
            if face_H * face_W / video_area < pct:
                new_faces.append(face)
                new_scores.append(score)
        frame_data["faces"] = new_faces
        frame_data["scores"] = new_scores
446
+
447
def keep_only_best_face(self, crops):
    """Keep only the first face (and its score) of every frame.

    The first entry is the highest-confidence face when the frame data has
    been sorted by descending score. This removes false positives but is
    obviously problematic for videos showing two people.

    This is an optional postprocessing step. Modifies the original
    data structure.
    """
    for frame_data in crops:
        if len(frame_data["faces"]) == 0:
            continue
        for key in ("faces", "scores"):
            frame_data[key] = frame_data[key][:1]
461
+
462
+ # TODO: def filter_likely_false_positives(self, crops):
463
+ # if only some frames have more than 1 face, it's likely a false positive
464
+ # if most frames have more than 1 face, it's probably two people
465
+ # so find the % of frames with > 1 face; if > 0.X, keep the two best faces
466
+
467
+ # TODO: def filter_by_score(self, crops, min_score) to remove any
468
+ # crops with a confidence score lower than min_score
469
+
470
+ # TODO: def sort_by_histogram(self, crops) for videos with 2 people.
models/icpr2020dfdc/blazeface/read_video.py ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import cv2
2
+ import numpy as np
3
+
4
+
5
class VideoReader:
    """Helper class for reading one or more frames from a video file."""

    def __init__(self, verbose=True, insets=(0, 0)):
        """Creates a new VideoReader.

        Arguments:
            verbose: whether to print warnings and error messages
            insets: amount to inset the image by, as a fraction of
                (width, height). This lets you "zoom in" to an image
                to remove unimportant content around the borders.
                Useful for face detection, which may not work if the
                faces are too small.
        """
        self.verbose = verbose
        self.insets = insets

    def read_frames(self, path, num_frames, jitter=0, seed=None):
        """Reads frames that are evenly spaced throughout the video.

        Arguments:
            path: the video file
            num_frames: how many frames to read (must be positive)
            jitter: if not 0, adds small random offsets to the frame indices;
                this is useful so we don't always land on even or odd frames
            seed: random seed for jittering; if you set this to a fixed value,
                you probably want to set it only on the first video

        Returns the same (frames, indices) tuple as
        ``read_frames_at_indices``, or None if the video has no frames or
        reading failed.
        """
        assert num_frames > 0

        capture = cv2.VideoCapture(path)
        try:
            frame_count = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))
            if frame_count <= 0:
                return None

            frame_idxs = np.linspace(0, frame_count - 1, num_frames, endpoint=True, dtype=int)
            frame_idxs = np.unique(frame_idxs)  # Avoid repeating frame idxs
            if jitter > 0:
                np.random.seed(seed)
                jitter_offsets = np.random.randint(-jitter, jitter, len(frame_idxs))
                frame_idxs = np.clip(frame_idxs + jitter_offsets, 0, frame_count - 1)

            return self._read_frames_at_indices(path, capture, frame_idxs)
        finally:
            # Release the capture on every exit path, including early returns.
            capture.release()

    def read_random_frames(self, path, num_frames, seed=None):
        """Picks the frame indices at random.

        Indices are drawn with replacement, so fewer than ``num_frames``
        distinct frames may be returned.

        Arguments:
            path: the video file
            num_frames: how many indices to draw (must be positive)
            seed: random seed used for drawing the indices
        """
        assert num_frames > 0
        np.random.seed(seed)

        capture = cv2.VideoCapture(path)
        try:
            frame_count = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))
            if frame_count <= 0:
                return None

            frame_idxs = sorted(np.random.choice(np.arange(0, frame_count), num_frames))
            return self._read_frames_at_indices(path, capture, frame_idxs)
        finally:
            capture.release()

    def read_frames_at_indices(self, path, frame_idxs):
        """Reads frames from a video and puts them into a NumPy array.

        Arguments:
            path: the video file
            frame_idxs: a list of frame indices. If an index appears
                multiple times, the frame is still read only once.

        Returns:
            - a NumPy array of shape (num_frames, height, width, 3)
            - a list of the frame indices that were read

        Reading stops if loading a frame fails, in which case the first
        dimension returned may actually be less than num_frames.

        Returns None if an exception is thrown for any reason, or if no
        frames were read.
        """
        assert len(frame_idxs) > 0
        capture = cv2.VideoCapture(path)
        try:
            return self._read_frames_at_indices(path, capture, frame_idxs)
        finally:
            capture.release()

    def _read_frames_at_indices(self, path, capture, frame_idxs):
        """Scan ``capture`` from its first frame and decode the wanted frames.

        ``capture`` is expected to be freshly opened (positioned at frame 0).
        The requested indices are de-duplicated and sorted before scanning.
        """
        try:
            # Duplicates or unsorted indices (possible with random sampling
            # or jitter) would stall the sequential matching below.
            wanted = np.unique(np.asarray(frame_idxs, dtype=int))
            wanted = wanted[wanted >= 0]

            frames = []
            idxs_read = []
            # Always scan from frame 0: grab() advances one frame at a time
            # from the start of the file, so starting the counter anywhere
            # else would label the decoded frames with the wrong index.
            for frame_idx in range(0, int(wanted[-1]) + 1):
                # Get the next frame, but don't decode if we're not using it.
                ret = capture.grab()
                if not ret:
                    if self.verbose:
                        print("Error grabbing frame %d from movie %s" % (frame_idx, path))
                    break

                # Need to look at this frame?
                if frame_idx == wanted[len(idxs_read)]:
                    ret, frame = capture.retrieve()
                    if not ret or frame is None:
                        if self.verbose:
                            print("Error retrieving frame %d from movie %s" % (frame_idx, path))
                        break

                    frames.append(self._postprocess_frame(frame))
                    idxs_read.append(frame_idx)

            if len(frames) > 0:
                return np.stack(frames), idxs_read
            if self.verbose:
                print("No frames read from movie %s" % path)
            return None
        except Exception:
            # Best effort by design: any failure is reported as "no result".
            # (Unlike a bare except, this lets KeyboardInterrupt through.)
            if self.verbose:
                print("Exception while reading movie %s" % path)
            return None

    def read_middle_frame(self, path):
        """Reads the frame from the middle of the video."""
        capture = cv2.VideoCapture(path)
        try:
            frame_count = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))
            return self._read_frame_at_index(path, capture, frame_count // 2)
        finally:
            capture.release()

    def read_frame_at_index(self, path, frame_idx):
        """Reads a single frame from a video.

        If you just want to read a single frame from the video, this is more
        efficient than scanning through the video to find the frame. However,
        for reading multiple frames it's not efficient.

        Returns a NumPy array of shape (1, H, W, 3) and the index of the frame,
        or None if reading failed.
        """
        capture = cv2.VideoCapture(path)
        try:
            return self._read_frame_at_index(path, capture, frame_idx)
        finally:
            capture.release()

    def _read_frame_at_index(self, path, capture, frame_idx):
        """Seek ``capture`` to ``frame_idx`` and decode that single frame."""
        capture.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
        ret, frame = capture.read()
        if not ret or frame is None:
            if self.verbose:
                print("Error retrieving frame %d from movie %s" % (frame_idx, path))
            return None

        frame = self._postprocess_frame(frame)
        return np.expand_dims(frame, axis=0), [frame_idx]

    def _postprocess_frame(self, frame):
        """Convert a decoded BGR frame to RGB and apply the configured insets."""
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        return self._crop_insets(frame)

    def _crop_insets(self, frame):
        """Trim ``self.insets`` (fractions of width, height) off each border."""
        if self.insets[0] > 0:
            W = frame.shape[1]
            p = int(W * self.insets[0])
            # p == 0 would turn the slice into 0:-0, i.e. an empty frame.
            if p > 0:
                frame = frame[:, p:-p, :]

        if self.insets[1] > 0:
            H = frame.shape[0]  # height is axis 0 (axis 1 is the width)
            q = int(H * self.insets[1])
            if q > 0:
                frame = frame[q:-q, :, :]

        return frame
185
+
186
+
187
class VideoReaderIspl(VideoReader):
    """
    Derived VideoReader class adding a fixed-rate ("hop") frame reader
    """

    def read_frames_with_hop(self, path: str, num_frames: int = -1, fps: int = -1):
        """Reads frames up to a certain number spaced throughout the video with a rate decided by the user.

        Arguments:
            path: the video file
            num_frames: how many frames to read, -1 means the entire video
                (warning: this will take up a lot of memory!)
            fps: how many frames per second to pick, -1 means every frame

        Returns the (frames, indices) tuple produced by
        ``_read_frames_at_indices``, or None if the video has no frames or
        reading failed.
        """
        # -1 is the documented "whole video" value and must be accepted.
        assert num_frames > 0 or num_frames == -1

        capture = cv2.VideoCapture(path)
        try:
            frame_count = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))
            if frame_count <= 0:
                return None

            video_rate = capture.get(cv2.CAP_PROP_FPS)
            hop = 1
            if fps != -1 and video_rate > 0:
                # Integer hop: number of source frames per picked frame.
                hop = max(int(video_rate // fps), 1)

            # Never ask for indices past the end of the video.
            end_pts = frame_count if num_frames == -1 else min(num_frames * hop, frame_count)
            # np.arange has no `endpoint` argument; the half-open range
            # [0, end_pts) already yields 0, hop, 2*hop, ... <= end_pts - 1.
            frame_idxs = np.arange(0, end_pts, hop, dtype=int)

            return self._read_frames_at_indices(path, capture, frame_idxs)
        finally:
            capture.release()
models/icpr2020dfdc/environment.yml ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: icpr2020
2
+ channels:
3
+ - pytorch
4
+ - conda-forge
5
+ - defaults
6
+ dependencies:
7
+ - av=6.2.0
8
+ - albumentations
9
+ - cudatoolkit
10
+ - ffmpeg
11
+ - jupyter
12
+ - numpy
13
+ - opencv=3.4.2
14
+ - py-opencv=3.4.2
15
+ - python=3.6.9
16
+ - pip
17
+ - pytorch=1.4.0
18
+ - torchvision
19
+ - tqdm
20
+ - pandas
21
+ - pip:
22
+ - tensorboardx==2.0
23
+ - efficientnet-pytorch
24
+ - scikit-learn
25
+
models/icpr2020dfdc/extract_faces.py ADDED
@@ -0,0 +1,346 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Extract faces
3
+
4
+ Video Face Manipulation Detection Through Ensemble of CNNs
5
+
6
+ Image and Sound Processing Lab - Politecnico di Milano
7
+
8
+ Nicolò Bonettini
9
+ Edoardo Daniele Cannas
10
+ Sara Mandelli
11
+ Luca Bondi
12
+ Paolo Bestagini
13
+ """
14
+ import argparse
15
+ import sys
16
+ import traceback
17
+ from concurrent.futures import ThreadPoolExecutor
18
+ from functools import partial
19
+ from pathlib import Path
20
+ from typing import Tuple, List
21
+
22
+ import numpy as np
23
+ import pandas as pd
24
+ import torch
25
+ import torch.cuda
26
+ from PIL import Image
27
+ from tqdm import tqdm
28
+
29
+ import blazeface
30
+ from blazeface import BlazeFace, VideoReader, FaceExtractor
31
+ from isplutils.utils import adapt_bb
32
+
33
+
34
def parse_args(argv):
    """Build the command-line parser and parse ``argv``.

    ``argv`` is the list of arguments without the program name. Returns
    the populated ``argparse.Namespace``.
    """
    parser = argparse.ArgumentParser()

    # Mandatory filesystem locations.
    required_paths = (
        ('--source', 'Videos root directory'),
        ('--videodf', 'Path to read the videos DataFrame'),
        ('--facesfolder', 'Faces output root directory'),
        ('--facesdf', 'Path to save the output DataFrame of faces'),
        ('--checkpoint', 'Path to save the temporary per-video outputs'),
    )
    for flag, description in required_paths:
        parser.add_argument(flag, type=Path, help=description, required=True)

    parser.add_argument('--fpv', type=int, default=32, help='Frames per video')

    default_device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    parser.add_argument('--device', type=torch.device, default=default_device,
                        help='Device to use for face extraction')

    parser.add_argument('--collateonly', help='Only perform collation of pre-existing results', action='store_true')
    # store_false: the attribute is True unless --noindex is passed.
    parser.add_argument('--noindex', help='Do not rebuild the index', action='store_false')

    # Optional integer tuning knobs.
    int_options = (
        ('--batch', 'Batch size', 16),
        ('--threads', 'Number of threads', 8),
        ('--offset', 'Offset to start extraction', 0),
        ('--num', 'Number of videos to process', 0),
    )
    for flag, description, default in int_options:
        parser.add_argument(flag, type=int, help=description, default=default)

    parser.add_argument('--lazycheck', action='store_true', help='Lazy check of existing video indexes')
    parser.add_argument('--deepcheck', action='store_true', help='Try to open every image')

    return parser.parse_args(argv)
56
+
57
+
58
def main(argv):
    """Extract faces from the indexed videos and collate the results.

    Unless ``--collateonly`` is given, faces are extracted batch by batch
    and one checkpoint pickle is written per video. Then, unless
    ``--noindex`` is given, the per-video checkpoints are merged into a
    single faces DataFrame and the videos DataFrame is updated with the
    number of faces found per video.

    ``argv`` is the list of command-line arguments without the program name.
    Raises ValueError when indexing finds no checkpoint at all.
    """
    opts = parse_args(argv)

    ## Parameters parsing
    device: torch.device = opts.device
    source_dir: Path = opts.source
    facedestination_dir: Path = opts.facesfolder
    frames_per_video: int = opts.fpv
    videodataset_path: Path = opts.videodf
    facesdataset_path: Path = opts.facesdf
    collateonly: bool = opts.collateonly
    batch_size: int = opts.batch
    threads: int = opts.threads
    offset: int = opts.offset
    num: int = opts.num
    lazycheck: bool = opts.lazycheck
    deepcheck: bool = opts.deepcheck
    checkpoint_folder: Path = opts.checkpoint
    index_enable: bool = opts.noindex

    ## Parameters
    face_size = 512

    print('Loading video DataFrame')
    df_videos = pd.read_pickle(videodataset_path)

    # Rows to process: `num` videos starting at `offset`, or all from `offset`.
    selection = slice(offset, offset + num) if num > 0 else slice(offset, None)
    df_videos_process = df_videos.iloc[selection]

    if not collateonly:

        ## Blazeface loading
        print('Loading face extractor')
        facedet = BlazeFace().to(device)
        facedet.load_weights("blazeface/blazeface.pth")
        facedet.load_anchors("blazeface/anchors.npy")
        videoreader = VideoReader(verbose=False)

        def video_read_fn(video_path):
            return videoreader.read_frames(video_path, num_frames=frames_per_video)

        face_extractor = FaceExtractor(video_read_fn, facedet)

        worker = partial(process_video,
                         source_dir=source_dir,
                         facedestination_dir=facedestination_dir,
                         checkpoint_folder=checkpoint_folder,
                         face_size=face_size,
                         face_extractor=face_extractor,
                         lazycheck=lazycheck,
                         deepcheck=deepcheck,
                         )

        ## Face extraction
        with ThreadPoolExecutor(threads) as pool:
            batch_starts = np.arange(start=0, stop=len(df_videos_process), step=batch_size)
            for batch_start in tqdm(batch_starts, desc='Extracting faces'):
                batch_rows = df_videos_process.iloc[batch_start:batch_start + batch_size].iterrows()
                # Finish the whole batch before anything is written to disk.
                tosave_list = list(pool.map(worker, batch_rows))

                for tosave in tosave_list:
                    if tosave is None:
                        continue
                    # tosave: (faces DataFrame, checkpoint path, images to save)
                    if len(tosave[2]):
                        list(pool.map(save_jpg, tosave[2]))
                    tosave[1].parent.mkdir(parents=True, exist_ok=True)
                    tosave[0].to_pickle(str(tosave[1]))

    if index_enable:
        # Collect checkpoints
        df_videos['nfaces'] = np.zeros(len(df_videos), np.uint8)
        faces_dataset = []
        for idx, record in tqdm(df_videos.iterrows(), total=len(df_videos), desc='Collecting faces results'):
            # Checkpoint
            video_face_checkpoint_path = checkpoint_folder.joinpath(record['path']).with_suffix('.faces.pkl')
            if not video_face_checkpoint_path.exists():
                continue
            try:
                df_video_faces = pd.read_pickle(str(video_face_checkpoint_path))
                # Fix same attribute issue
                df_video_faces = df_video_faces.rename(columns={'subject': 'videosubject'}, errors='ignore')
                # The subject number is encoded in the index as '..._subj<N>.jpg'.
                subject_ids = df_video_faces.index.map(lambda x: int(x.split('_subj')[1].split('.jpg')[0]))
                nfaces = len(np.unique(subject_ids))
                df_videos.loc[idx, 'nfaces'] = nfaces
                faces_dataset.append(df_video_faces)
            except Exception as e:
                # An unreadable checkpoint is reported and removed.
                print('Error while reading: {}'.format(video_face_checkpoint_path))
                print(e)
                video_face_checkpoint_path.unlink()

        if len(faces_dataset) == 0:
            raise ValueError(f'No checkpoint found from face extraction. '
                             f'Is the the source path {source_dir} correct for the videos in your dataframe?')

        # Save videos with updated faces
        print('Saving videos DataFrame to {}'.format(videodataset_path))
        df_videos.to_pickle(str(videodataset_path))

        # Work out the file-name suffix describing the processed video range.
        if offset > 0 and num > 0:
            range_suffix = '_from_video_{}_to_video_{}.pkl'.format(offset, num + offset)
        elif offset > 0:
            range_suffix = '_from_video_{}.pkl'.format(offset)
        elif num > 0:
            range_suffix = '_from_video_{}_to_video_{}.pkl'.format(0, num)
        else:
            range_suffix = None

        if range_suffix is None:
            if facesdataset_path.is_dir():
                facesdataset_path = facesdataset_path.joinpath('faces_df.pkl')  # just a check if the path is a dir
        elif facesdataset_path.is_dir():
            facesdataset_path = facesdataset_path.joinpath('faces_df' + range_suffix)
        else:
            # Keep the given file name up to its first dot and append the range.
            base_name = str(facesdataset_path.parts[-1]).split('.')[0]
            facesdataset_path = facesdataset_path.parent.joinpath(base_name + range_suffix)

        # Creates directory (if doesn't exist)
        facesdataset_path.parent.mkdir(parents=True, exist_ok=True)
        print('Saving faces DataFrame to {}'.format(facesdataset_path))
        df_faces = pd.concat(faces_dataset, axis=0, )
        df_faces['video'] = df_faces['video'].astype('category')
        int16_columns = ['kp1x', 'kp1y', 'kp2x', 'kp2y', 'kp3x',
                         'kp3y', 'kp4x', 'kp4y', 'kp5x', 'kp5y', 'kp6x', 'kp6y', 'left',
                         'top', 'right', 'bottom', ]
        for key in int16_columns:
            df_faces[key] = df_faces[key].astype(np.int16)
        df_faces['videosubject'] = df_faces['videosubject'].astype(np.int8)
        # Eventually remove duplicates
        df_faces = df_faces.loc[~df_faces.index.duplicated(keep='first')]
        candidate_fields = ['folder', 'subject', 'scene', 'cluster', 'nfaces', 'test']
        fields_to_preserve_from_video = [field for field in candidate_fields if field in df_videos]
        df_faces = pd.merge(df_faces, df_videos[fields_to_preserve_from_video], left_on='video',
                            right_index=True)
        df_faces.to_pickle(str(facesdataset_path))

    print('Completed!')
197
+
198
+
199
def save_jpg(args: Tuple[Image.Image, Path or str]):
    """
    Save one image to disk as a JPEG.

    The image and its destination arrive packed in a single tuple so that the
    function can be handed directly to a ``map`` call.

    :param args: ``(image, path)`` pair
    """
    picture, destination = args
    # Quality 95 with 4:4:4 subsampling, i.e. chroma is kept at full resolution
    picture.save(destination, quality=95, subsampling='4:4:4')
202
+
203
+
204
def process_video(item: Tuple[pd.Index, pd.Series],
                  source_dir: Path,
                  facedestination_dir: Path,
                  checkpoint_folder: Path,
                  face_size: int,
                  face_extractor: FaceExtractor,
                  lazycheck: bool = False,
                  deepcheck: bool = False,
                  ) -> (pd.DataFrame, Path, List[Tuple[Image.Image, Path]]) or None:
    """
    Extract the faces of one video and describe them in a DataFrame.

    The face images and the checkpoint pickle are NOT written here (only their parent
    folders are created): they are returned so that the caller can save them.

    :param item: (index, record) pair as yielded by ``DataFrame.iterrows()``. The record must
        provide 'path', 'label' and 'original'; 'class', 'source' and 'quality' are copied
        to the faces when present
    :param source_dir: root folder of the videos, ``record['path']`` is relative to it
    :param facedestination_dir: root folder of the extracted face images
    :param checkpoint_folder: root folder of the per-video '.faces.pkl' checkpoints
    :param face_size: height and width of the bounding box requested from ``adapt_bb``
    :param face_extractor: extractor used to detect the faces in the video
    :param lazycheck: if True an existing checkpoint is trusted without any verification
    :param deepcheck: if True (and lazycheck is False) every face image listed in an existing
        checkpoint is also opened and decoded
    :return: (faces DataFrame, checkpoint path, list of (image, path) still to be saved),
        or None if a valid checkpoint already exists, no frame could be read,
        or the processing failed (the traceback is printed)
    """
    # Unpack index and Series
    idx, record = item

    # Checkpoint
    video_faces_checkpoint_path = checkpoint_folder.joinpath(record['path']).with_suffix('.faces.pkl')

    if not lazycheck and video_faces_checkpoint_path.exists():
        try:
            df_video_faces = pd.read_pickle(str(video_faces_checkpoint_path))
            # The index of the checkpoint holds the face paths relative to facedestination_dir
            for face_relpath in df_video_faces.index:
                face_path = facedestination_dir.joinpath(face_relpath)
                # Explicit exceptions instead of assert: the validation must survive `python -O`
                if not face_path.exists():
                    raise FileNotFoundError('Missing face image: {}'.format(face_path))
                if deepcheck:
                    # Context manager: the file handle is released even if decoding fails
                    with Image.open(face_path) as img:
                        img_arr = np.asarray(img)
                    if img_arr.ndim != 3:
                        raise ValueError('Face image is not a 3-dimensional array: {}'.format(face_path))
                    if not np.prod(img_arr.shape) > 0:
                        raise ValueError('Face image is empty: {}'.format(face_path))
        except Exception as e:
            # Any failure invalidates the checkpoint, so that the video is processed again below
            print('Error while checking: {}'.format(video_faces_checkpoint_path))
            print(e)
            video_faces_checkpoint_path.unlink()

    if video_faces_checkpoint_path.exists():
        # Valid (or trusted) checkpoint: nothing to do
        return None

    try:
        video_face_dict_list = []

        # Load faces
        current_video_path = source_dir.joinpath(record['path'])
        if not current_video_path.exists():
            raise FileNotFoundError(f'Unable to find {current_video_path}.'
                                    f'Are you sure that {source_dir} is the correct source directory for the video '
                                    f'you indexed in the dataframe?')

        frames = face_extractor.process_video(current_video_path)

        if len(frames) == 0:
            return None

        face_extractor.keep_only_best_face(frames)
        # Every detection is assigned to subject 0
        for frame in frames:
            frame['subjects'] = [0] * len(frame['detections'])

        # Extract and save faces, bounding boxes, keypoints
        images_to_save: List[Tuple[Image.Image, Path]] = []
        for frame in frames:
            if not len(frame['detections']):
                continue

            fullframe = Image.fromarray(frame['frame'])

            # Preserve the only found face even if not a good one, otherwise preserve only clusters > -1
            subjects = np.unique(frame['subjects'])
            if len(subjects) > 1:
                subjects = np.asarray([s for s in subjects if s > -1])

            for face_idx, _ in enumerate(frame['faces']):
                subj_id = frame['subjects'][face_idx]
                if subj_id not in subjects:  # Exclude outliers if other faces detected
                    continue

                face_path = facedestination_dir.joinpath(record['path'], 'fr{:03d}_subj{:1d}.jpg'.format(
                    frame['frame_idx'], subj_id))

                face_dict = {'facepath': str(face_path.relative_to(facedestination_dir)), 'video': idx,
                             'label': record['label'], 'videosubject': subj_id,
                             'original': record['original']}
                # add attributes for ff++
                for attribute in ('class', 'source', 'quality'):
                    if attribute in record.keys():
                        face_dict[attribute] = record[attribute]

                for field_idx, key in enumerate(blazeface.BlazeFace.detection_keys):
                    face_dict[key] = frame['detections'][face_idx][field_idx]

                cropping_bb = adapt_bb(frame_height=fullframe.height,
                                       frame_width=fullframe.width,
                                       bb_height=face_size,
                                       bb_width=face_size,
                                       left=face_dict['xmin'],
                                       top=face_dict['ymin'],
                                       right=face_dict['xmax'],
                                       bottom=face_dict['ymax'])
                face = fullframe.crop(cropping_bb)

                # Move box and keypoints from frame coordinates to crop coordinates
                for key in blazeface.BlazeFace.detection_keys:
                    if (key[0] == 'k' and key[-1] == 'x') or (key[0] == 'x'):
                        face_dict[key] -= cropping_bb[0]
                    elif (key[0] == 'k' and key[-1] == 'y') or (key[0] == 'y'):
                        face_dict[key] -= cropping_bb[1]

                face_dict['left'] = face_dict.pop('xmin')
                face_dict['top'] = face_dict.pop('ymin')
                face_dict['right'] = face_dict.pop('xmax')
                face_dict['bottom'] = face_dict.pop('ymax')

                face_path.parent.mkdir(parents=True, exist_ok=True)
                images_to_save.append((face, face_path))

                video_face_dict_list.append(face_dict)

        if len(video_face_dict_list) > 0:

            df_video_faces = pd.DataFrame(video_face_dict_list)
            df_video_faces.index = df_video_faces['facepath']
            del df_video_faces['facepath']

            # type conversions
            for key in ['kp1x', 'kp1y', 'kp2x', 'kp2y', 'kp3x', 'kp3y',
                        'kp4x', 'kp4y', 'kp5x', 'kp5y', 'kp6x', 'kp6y', 'left', 'top',
                        'right', 'bottom']:
                df_video_faces[key] = df_video_faces[key].astype(np.int16)
            df_video_faces['conf'] = df_video_faces['conf'].astype(np.float32)
            df_video_faces['video'] = df_video_faces['video'].astype('category')

            video_faces_checkpoint_path.parent.mkdir(parents=True, exist_ok=True)

        else:
            print('No faces extracted for video {}'.format(record['path']))
            df_video_faces = pd.DataFrame()

        return df_video_faces, video_faces_checkpoint_path, images_to_save

    except Exception as e:
        # Best effort: a failing video must not stop the whole extraction
        print('Error while processing: {}'.format(record['path']))
        print("-" * 60)
        traceback.print_exc(file=sys.stdout, limit=5)
        print("-" * 60)
        return None
343
+
344
+
345
+ if __name__ == '__main__':
346
+ main(sys.argv[1:])
models/icpr2020dfdc/index_celebdf.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Index Celeb-DF v2
3
+ Image and Sound Processing Lab - Politecnico di Milano
4
+ Nicolò Bonettini
5
+ Edoardo Daniele Cannas
6
+ Sara Mandelli
7
+ Luca Bondi
8
+ Paolo Bestagini
9
+ """
10
+ import argparse
11
+ from multiprocessing import Pool
12
+ from pathlib import Path
13
+
14
+ import numpy as np
15
+ import pandas as pd
16
+
17
+ from isplutils.utils import extract_meta_av, extract_meta_cv
18
+
19
+
20
def main():
    """
    Index the Celeb-DF v2 videos found under ``--source`` into a pickled DataFrame.

    If the file given by ``--videodataset`` already exists it is only loaded; otherwise the
    source folder is scanned for '*.mp4' files and, for each video, height, width, number of
    frames, class, label (True is FAKE), name, index of the original video and test-split
    membership are collected. The real/fake counts are printed in both cases.

    :raises FileNotFoundError: if 'List_of_testing_videos.txt' is missing from the source dir
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--source', type=Path, help='Source dir',
                        required=True)
    parser.add_argument('--videodataset', type=Path, default='data/celebdf_videos.pkl',
                        help='Path to save the videos DataFrame')

    args = parser.parse_args()

    ## Parameters parsing
    source_dir: Path = args.source
    videodataset_path: Path = args.videodataset

    # Create output folder (if doesn't exist)
    videodataset_path.parent.mkdir(parents=True, exist_ok=True)

    ## DataFrame
    if videodataset_path.exists():
        print('Loading video DataFrame')
        df_videos = pd.read_pickle(videodataset_path)
    else:
        print('Creating video DataFrame')

        split_file = Path(source_dir).joinpath('List_of_testing_videos.txt')
        if not split_file.exists():
            raise FileNotFoundError('Unable to find "List_of_testing_videos.txt" in {}'.format(source_dir))
        # header=None: every line of the list is a video. With header=0 the first line would be
        # consumed as column names and that video would never be marked as a test video.
        test_videos_df = pd.read_csv(split_file, delimiter=' ', header=None, index_col=1)

        ff_videos = Path(source_dir).rglob('*.mp4')
        df_videos = pd.DataFrame(
            {'path': [f.relative_to(source_dir) for f in ff_videos]})

        df_videos['height'] = df_videos['width'] = df_videos['frames'] = np.zeros(len(df_videos), dtype=np.uint16)
        with Pool() as p:
            meta = p.map(extract_meta_av, df_videos['path'].map(lambda x: str(source_dir.joinpath(x))))
            meta = np.stack(meta)
            df_videos.loc[:, ['height', 'width', 'frames']] = meta

        # Fix for videos that av cannot decode properly
        for idx, record in df_videos[df_videos['frames'] == 0].iterrows():
            meta = extract_meta_cv(str(source_dir.joinpath(record['path'])))
            df_videos.loc[idx, ['height', 'width', 'frames']] = meta

        # The first path component is the class folder
        df_videos['class'] = df_videos['path'].map(lambda x: x.parts[0]).astype('category')
        df_videos['label'] = df_videos['class'].map(
            lambda x: x == 'Celeb-synthesis')  # True is FAKE, False is REAL
        df_videos['name'] = df_videos['path'].map(lambda x: x.with_suffix('').name)

        # For fake videos 'original' is the index of the video whose name is made of the
        # first and third '_'-separated tokens of the fake name; -1 for real videos
        df_videos['original'] = -1 * np.ones(len(df_videos), dtype=np.int16)
        is_fake = df_videos['label'] == True
        df_videos.loc[is_fake, 'original'] = \
            df_videos[is_fake]['name'].map(
                lambda x: df_videos.index[
                    np.flatnonzero(df_videos['name'] == '_'.join([x.split('_')[0], x.split('_')[2]]))[0]]
            )

        # as_posix(): compare with forward slashes on every platform
        df_videos['test'] = df_videos['path'].map(lambda x: x.as_posix()).isin(test_videos_df.index)

        print('Saving video DataFrame to {}'.format(videodataset_path))
        df_videos.to_pickle(str(videodataset_path))

    print('Real videos: {:d}'.format(sum(df_videos['label'] == 0)))
    print('Fake videos: {:d}'.format(sum(df_videos['label'] == 1)))
83
+
84
+ if __name__ == '__main__':
85
+ main()
models/icpr2020dfdc/index_dfdc.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Index the official Kaggle training dataset and prepares a train and validation set based on folders
3
+
4
+ Video Face Manipulation Detection Through Ensemble of CNNs
5
+
6
+ Image and Sound Processing Lab - Politecnico di Milano
7
+
8
+ Nicolò Bonettini
9
+ Edoardo Daniele Cannas
10
+ Sara Mandelli
11
+ Luca Bondi
12
+ Paolo Bestagini
13
+ """
14
+ import sys
15
+ import argparse
16
+ from multiprocessing import Pool
17
+ from pathlib import Path
18
+
19
+ import numpy as np
20
+ import pandas as pd
21
+ from tqdm import tqdm
22
+
23
+ from isplutils.utils import extract_meta_av
24
+
25
+
26
def parse_args(argv):
    """
    Parse the command line of the DFDC indexing script.

    :param argv: list of command-line tokens (without the program name)
    :return: namespace with ``source``, ``videodataset`` and ``batch``
    """
    cli = argparse.ArgumentParser()
    option_specs = (
        ('--source', dict(type=Path, help='Source dir', required=True)),
        ('--videodataset', dict(type=Path, default='data/dfdc_videos.pkl',
                                help='Path to save the videos DataFrame')),
        ('--batch', dict(type=int, help='Batch size', default=64)),
    )
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)

    return cli.parse_args(argv)
34
+
35
+
36
def main(argv):
    """
    Index the DFDC training videos into a pickled DataFrame.

    An existing ``--videodataset`` file is simply loaded. Otherwise every 'metadata.json'
    found under ``--source`` is read, the per-folder tables are concatenated, and height,
    width and number of frames of each video are collected in batches with a process pool.
    The real/fake counts are printed in both cases.

    :param argv: list of command-line tokens (without the program name)
    """
    ## Parameters parsing
    args = parse_args(argv)
    source_dir: Path = args.source
    videodataset_path: Path = args.videodataset
    batch_size: int = args.batch

    ## DataFrame
    if videodataset_path.exists():
        print('Loading video DataFrame')
        df_videos = pd.read_pickle(videodataset_path)
    else:
        print('Creating video DataFrame')

        # Create output folder
        videodataset_path.parent.mkdir(parents=True, exist_ok=True)

        # Index: one table per metadata.json, i.e. per folder
        folder_tables = []
        for json_path in tqdm(sorted(source_dir.rglob('metadata.json')), desc='Indexing'):
            folder_table = pd.read_json(json_path, orient='index')
            folder_relpath = json_path.parent.relative_to(source_dir)
            folder_table['path'] = folder_table.index.map(
                lambda video_name: str(folder_relpath.joinpath(video_name)))
            # The folder number is the last '_'-separated token of the folder name
            folder_name = str(json_path.parts[-2])
            folder_table['folder'] = int(folder_name.split('_')[-1])
            folder_tables.append(folder_table)
        df_videos = pd.concat(folder_tables, axis=0, verify_integrity=True)

        # Save space
        df_videos = df_videos.drop(columns=['split'])
        df_videos['label'] = df_videos['label'] == 'FAKE'
        df_videos['original'] = df_videos['original'].astype('category')
        df_videos['folder'] = df_videos['folder'].astype(np.uint8)

        # Collect metadata, one batch of videos at a time
        absolute_paths = np.asarray(df_videos.path.map(lambda x: str(source_dir.joinpath(x))))
        meta_rows = []
        with Pool() as pool:
            for first in tqdm(np.arange(start=0, stop=len(df_videos), step=batch_size), desc='Metadata'):
                meta_rows.extend(pool.map(extract_meta_av, absolute_paths[first:first + batch_size]))

        # Each metadata row is indexed as (height, width, frames)
        for position, column in enumerate(('height', 'width', 'frames')):
            df_videos[column] = np.asarray([row[position] for row in meta_rows], dtype=np.uint16)

        print('Saving video DataFrame to {}'.format(videodataset_path))
        df_videos.to_pickle(str(videodataset_path))

    print('Real videos: {:d}'.format(sum(df_videos['label'] == 0)))
    print('Fake videos: {:d}'.format(sum(df_videos['label'] == 1)))
91
+
92
+
93
+ if __name__ == '__main__':
94
+ main(sys.argv[1:])
models/icpr2020dfdc/index_ffpp.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Index FaceForensics++
3
+
4
+ Video Face Manipulation Detection Through Ensemble of CNNs
5
+
6
+ Image and Sound Processing Lab - Politecnico di Milano
7
+
8
+ Nicolò Bonettini
9
+ Edoardo Daniele Cannas
10
+ Sara Mandelli
11
+ Luca Bondi
12
+ Paolo Bestagini
13
+ """
14
+ import argparse
15
+ import sys
16
+ from multiprocessing import Pool
17
+ from pathlib import Path
18
+
19
+ import numpy as np
20
+ import pandas as pd
21
+
22
+ from isplutils.utils import extract_meta_av, extract_meta_cv
23
+
24
+
25
def parse_args(argv):
    """
    Parse the command line of the FaceForensics++ indexing script.

    :param argv: list of command-line tokens (without the program name)
    :return: namespace with ``source`` and ``videodataset``
    """
    cli = argparse.ArgumentParser()

    source_options = dict(type=Path, help='Source dir',
                          default='dataset/ffpp/faceforensics')
    cli.add_argument('--source', **source_options)

    output_options = dict(type=Path, default='data/ffpp_videos.pkl',
                          help='Path to save the videos DataFrame')
    cli.add_argument('--videodataset', **output_options)

    parsed = cli.parse_args(argv)
    return parsed
33
+
34
+
35
def main(argv):
    """
    Index the FaceForensics++ videos into a pickled DataFrame.

    An existing ``--videodataset`` file is simply loaded. Otherwise ``--source`` is scanned for
    '*.mp4' files (skipping those whose path inside the dataset contains 'mask' or 'raw') and,
    for each video, height, width, number of frames, class, label (True is FAKE), source,
    quality, name and index of the original video are collected.
    The real/fake counts are printed in both cases.

    :param argv: list of command-line tokens (without the program name)
    """
    ## Parameters parsing
    args = parse_args(argv)
    source_dir: Path = args.source
    videodataset_path: Path = args.videodataset

    # Create output folder (if doesn't exist)
    videodataset_path.parent.mkdir(parents=True, exist_ok=True)

    ## DataFrame
    if videodataset_path.exists():
        print('Loading video DataFrame')
        df_videos = pd.read_pickle(videodataset_path)
    else:
        print('Creating video DataFrame')

        # Filter on the path RELATIVE to the source dir: testing the full path would discard
        # every video whenever the source dir itself contains 'mask' or 'raw' (e.g. /data/raw/ffpp)
        relative_videos = (f.relative_to(source_dir) for f in Path(source_dir).rglob('*.mp4'))
        df_videos = pd.DataFrame(
            {'path': [rel for rel in relative_videos if 'mask' not in str(rel) and 'raw' not in str(rel)]})

        df_videos['height'] = df_videos['width'] = df_videos['frames'] = np.zeros(len(df_videos), dtype=np.uint16)
        with Pool() as p:
            meta = p.map(extract_meta_av, df_videos['path'].map(lambda x: str(source_dir.joinpath(x))))
            meta = np.stack(meta)
            df_videos.loc[:, ['height', 'width', 'frames']] = meta

        # Fix for videos that av cannot decode properly
        for idx, record in df_videos[df_videos['frames'] == 0].iterrows():
            meta = extract_meta_cv(str(source_dir.joinpath(record['path'])))
            df_videos.loc[idx, ['height', 'width', 'frames']] = meta

        # Class, source and quality are the first three components of the relative path
        df_videos['class'] = df_videos['path'].map(lambda x: x.parts[0]).astype('category')
        df_videos['label'] = df_videos['class'].map(
            lambda x: x == 'manipulated_sequences')  # True is FAKE, False is REAL
        df_videos['source'] = df_videos['path'].map(lambda x: x.parts[1]).astype('category')
        df_videos['quality'] = df_videos['path'].map(lambda x: x.parts[2]).astype('category')
        df_videos['name'] = df_videos['path'].map(lambda x: x.with_suffix('').parts[-1])

        # 'original' is the index of the first video named like the original, -1 for real videos
        df_videos['original'] = -1 * np.ones(len(df_videos), dtype=np.int16)
        is_fake = df_videos['label'] == True
        is_dfd = df_videos['source'] == 'DeepFakeDetection'

        # Outside DeepFakeDetection the original name is the first '_'-separated token
        df_videos.loc[is_fake & ~is_dfd, 'original'] = \
            df_videos[is_fake & ~is_dfd]['name'].map(
                lambda x: df_videos.index[np.flatnonzero(df_videos['name'] == x.split('_')[0])[0]]
            )
        # DeepFakeDetection: first '_'-separated token + '__' + second '__'-separated token
        df_videos.loc[is_fake & is_dfd, 'original'] = \
            df_videos[is_fake & is_dfd]['name'].map(
                lambda x: df_videos.index[
                    np.flatnonzero(df_videos['name'] == x.split('_')[0] + '__' + x.split('__')[1])[0]]
            )

        print('Saving video DataFrame to {}'.format(videodataset_path))
        df_videos.to_pickle(str(videodataset_path))

    print('Real videos: {:d}'.format(sum(df_videos['label'] == 0)))
    print('Fake videos: {:d}'.format(sum(df_videos['label'] == 1)))
89
+
90
+
91
+ if __name__ == '__main__':
92
+ main(sys.argv[1:])
models/icpr2020dfdc/isplutils/__init__.py ADDED
File without changes
models/icpr2020dfdc/isplutils/data.py ADDED
@@ -0,0 +1,263 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Video Face Manipulation Detection Through Ensemble of CNNs
3
+
4
+ Image and Sound Processing Lab - Politecnico di Milano
5
+
6
+ Nicolò Bonettini
7
+ Edoardo Daniele Cannas
8
+ Sara Mandelli
9
+ Luca Bondi
10
+ Paolo Bestagini
11
+ """
12
+ import os
13
+ from pathlib import Path
14
+ from typing import List
15
+
16
+ import albumentations as A
17
+ import numpy as np
18
+ import pandas as pd
19
+ import torch
20
+ from PIL import Image
21
+ from albumentations.pytorch import ToTensorV2
22
+ from torch.utils.data import Dataset, IterableDataset
23
+
24
+ from .utils import extract_bb
25
+
26
+
27
def load_face(record: pd.Series, root: str, size: int, scale: str, transformer: A.BasicTransform) -> torch.Tensor:
    """
    Load the face described by a record, going through an on-disk cache when possible.

    If a cached version exists it is read (and deleted when unreadable); otherwise the face is
    extracted from the source image with ``extract_bb`` and, when ``size < 256`` or
    ``scale == 'tight'``, stored in the cache. On read errors a black image is used instead.

    :param record: row whose name is the image path relative to ``root`` and which provides
        'left', 'top', 'right' and 'bottom'
    :param root: root folder of the images; the cache lives in its 'autocache' subfolder
    :param size: face size
    :param scale: scaling mode forwarded to ``extract_bb``
    :param transformer: callable applied as ``transformer(image=face)['image']``
    :return: the transformed face
    :raises RuntimeError: if a loaded image is not a 3-dimensional array
    """
    relative_name = str(record.name)
    path = os.path.join(str(root), relative_name)
    autocache = size < 256 or scale == 'tight'

    # When scale == 'tight' the extracted face is not dependent on size,
    # hence the size appears in the cache path only for 'crop' and 'scale'
    cache_folders = ['autocache', scale]
    if scale in ('crop', 'scale'):
        cache_folders.append(str(size))
    cached_path = str(Path(root).joinpath(*cache_folders, relative_name).with_suffix('.jpg'))

    def blank():
        # Black placeholder used whenever an image cannot be read
        return np.zeros((size, size, 3), dtype=np.uint8)

    face = blank()

    if os.path.exists(cached_path):
        # Only OSError/IOError are handled: anything else (keyboard interrupts included) propagates
        try:
            face = np.array(Image.open(cached_path))
            if face.ndim != 3:
                raise RuntimeError('Incorrect format: {}'.format(path))
        except (OSError, IOError) as e:
            print('Deleting corrupted cache file: {}'.format(cached_path))
            print(e)
            os.unlink(cached_path)
            face = blank()

    # Either there was no cache, or the corrupted cache file has just been removed
    if not os.path.exists(cached_path):
        try:
            frame = Image.open(path)
            bb = tuple(record[side] for side in ('left', 'top', 'right', 'bottom'))
            face = extract_bb(frame, bb=bb, size=size, scale=scale)

            if autocache:
                os.makedirs(os.path.dirname(cached_path), exist_ok=True)
                face.save(cached_path, quality=95, subsampling='4:4:4')

            face = np.array(face)
            if face.ndim != 3:
                raise RuntimeError('Incorrect format: {}'.format(path))
        except (OSError, IOError) as e:
            print('Error while reading: {}'.format(path))
            print(e)
            face = blank()

    return transformer(image=face)['image']
76
+
77
+
78
class FrameFaceIterableDataset(IterableDataset):
    """
    Iterable dataset yielding faces from one or more faces DataFrames,
    alternating one fake sample and one real sample.
    """

    def __init__(self,
                 roots: List[str],
                 dfs: List[pd.DataFrame],
                 size: int, scale: str,
                 num_samples: int = -1,
                 transformer: A.BasicTransform = ToTensorV2(),
                 output_index: bool = False,
                 labels_map: dict = None,
                 seed: int = None):
        """

        :param roots: List of root folders of the face images, one per DataFrame
        :param dfs: List of faces DataFrames with a 'label' column, plus the
            columns read by ``load_face``
        :param size: face size
        :param scale: scaling mode forwarded to ``load_face``
        :param num_samples: upper bound on the number of samples; any value <= 0 means
            twice the size of the larger class
        :param transformer: transformation applied to every face
        :param output_index: enable output of the record name together with face and label
        :param labels_map: map from the values of the 'label' column to the actual labels
        :param seed: base seed for the sampling; drawn at random when None
        """

        self.dfs = dfs
        self.size = int(size)
        self.scale = str(scale)
        self.roots = [str(root) for root in roots]
        self.transformer = transformer
        self.output_idx = bool(output_index)

        if seed is not None:
            self.seed0 = int(seed)
        else:
            self.seed0 = np.random.choice(2 ** 32)

        # Re-index a copy of every DataFrame with (df_idx, df_key),
        # so that rows stay distinguishable once concatenated
        reindexed = []
        for df_idx, source_df in enumerate(self.dfs):
            df_copy = source_df.copy()
            df_copy.index = pd.MultiIndex.from_tuples(
                [(df_idx, key) for key in df_copy.index], names=['df_idx', 'df_key'])
            reindexed.append(df_copy)
        self.df = pd.concat(reindexed, axis=0, join='inner')

        self.df_real = self.df[self.df['label'] == 0]
        self.df_fake = self.df[self.df['label'] == 1]

        n_real, n_fake = len(self.df_real), len(self.df_fake)
        self.longer_set = 'real' if n_real > n_fake else 'fake'
        balanced_size = max(n_real, n_fake) * 2
        self.num_samples = min(balanced_size, num_samples) if num_samples > 0 else balanced_size

        if labels_map is None:
            self.labels_map = {False: np.array([0., ]), True: np.array([1., ])}
        else:
            self.labels_map = dict(labels_map)

    def _get_face(self, item: pd.Index) -> (torch.Tensor, torch.Tensor) or (torch.Tensor, torch.Tensor, str):
        """
        Load the face addressed by a (df_idx, df_key) pair.

        :return: (face, label), followed by the record name if ``output_index`` was enabled
        """
        df_idx, df_key = item[0], item[1]
        record = self.dfs[df_idx].loc[df_key]
        face = load_face(record=record,
                         root=self.roots[df_idx],
                         size=self.size,
                         scale=self.scale,
                         transformer=self.transformer)

        label = self.labels_map[record.label]
        if not self.output_idx:
            return face, label
        return face, label, record.name

    def __len__(self):
        return self.num_samples

    def __iter__(self):
        fake_pool, real_pool = get_iterative_real_fake_idxs(
            df_real=self.df_real,
            df_fake=self.df_fake,
            num_samples=self.num_samples,
            seed0=self.seed0
        )

        # One fake then one real, until either pool is exhausted
        while fake_pool and real_pool:
            yield self._get_face(fake_pool.pop())
            yield self._get_face(real_pool.pop())
165
+
166
+
167
def get_iterative_real_fake_idxs(df_real: pd.DataFrame, df_fake: pd.DataFrame,
                                 num_samples: int, seed0: int):
    """
    Draw two equally long, shuffled lists of index keys: one of fake and one of real samples.

    NumPy's global random generator is seeded with ``seed0`` in the main process and with
    ``seed0 + worker_id`` inside a DataLoader worker (wrapped into the range NumPy accepts).

    In the main process both lists hold ``num_samples // 2`` keys: the longer class is sampled
    without replacement, the shorter one with replacement. Inside a worker each list holds
    ``(num_samples // 2) // num_workers`` keys: the longer class contributes the contiguous
    slice of its index assigned to the worker, the other class is sampled with replacement.

    :param df_real: DataFrame of the real samples
    :param df_fake: DataFrame of the fake samples
    :param num_samples: total number of samples (real + fake) over all the workers
    :param seed0: base seed
    :return: (random_fake_idxs, random_real_idxs) as lists
    """
    longer_set = 'real' if len(df_real) > len(df_fake) else 'fake'
    worker_info = torch.utils.data.get_worker_info()
    worker_id = 0 if worker_info is None else worker_info.id

    # np.random.seed only accepts values in [0, 2**32 - 1]: wrap around instead of raising
    # when seed0 + worker_id goes past the upper bound
    seed = int(seed0 + worker_id) % (2 ** 32)
    np.random.seed(seed)

    if worker_info is None:
        worker_num_couple_samples = num_samples // 2
        fake_idxs_portion = np.random.choice(df_fake.index, worker_num_couple_samples,
                                             replace=longer_set == 'real')
        real_idxs_portion = np.random.choice(df_real.index, worker_num_couple_samples,
                                             replace=longer_set == 'fake')
    else:
        worker_num_couple_samples = (num_samples // 2) // worker_info.num_workers
        # Slice of the longer class reserved to this worker
        worker_slice = slice(worker_id * worker_num_couple_samples,
                             (worker_id + 1) * worker_num_couple_samples)
        if longer_set == 'fake':
            fake_idxs_portion = df_fake.index[worker_slice]
            real_idxs_portion = np.random.choice(df_real.index, worker_num_couple_samples, replace=True)
        else:
            real_idxs_portion = df_real.index[worker_slice]
            fake_idxs_portion = np.random.choice(df_fake.index, worker_num_couple_samples,
                                                 replace=True)

    random_fake_idxs = list(np.random.permutation(fake_idxs_portion))
    random_real_idxs = list(np.random.permutation(real_idxs_portion))

    # Internal invariant: samples are consumed in fake/real couples
    assert (len(random_fake_idxs) == len(random_real_idxs))

    return random_fake_idxs, random_real_idxs
199
+
200
+
201
class FrameFaceDatasetTest(Dataset):
    """
    Map-style dataset returning the faces of a DataFrame in index order,
    optionally as a stack of augmented copies of each face.
    """

    def __init__(self, root: str, df: pd.DataFrame,
                 size: int, scale: str,
                 transformer: A.BasicTransform = ToTensorV2(),
                 labels_map: dict = None,
                 aug_transformers: List[A.BasicTransform] = None):
        """

        :param root: root folder of the face images
        :param df: faces DataFrame with a 'label' column, plus the columns read by ``load_face``
        :param size: face size
        :param scale: scaling mode forwarded to ``load_face``
        :param transformer: transformation applied to every face
        :param labels_map: dict to map df labels
        :param aug_transformers: if not None, creates multiple copies of the same sample
            according to the provided augmentations
        """

        self.df = df
        self.root = str(root)
        self.size = int(size)
        self.scale = str(scale)

        self.transformer = transformer
        self.aug_transformers = aug_transformers

        if labels_map is None:
            self.labels_map = {False: np.array([0., ]), True: np.array([1., ])}
        else:
            self.labels_map = dict(labels_map)

    def _get_face(self, item: pd.Index) -> (torch.Tensor, torch.Tensor) or (torch.Tensor, torch.Tensor, str):
        """
        Load the face stored under the index key ``item``.

        :return: (face, label); with augmentations, ``face`` is the stack of one copy
            per augmentation, each followed by the base transformer
        """
        record = self.df.loc[item]
        label = self.labels_map[record.label]
        shared_kwargs = dict(record=record, root=self.root, size=self.size, scale=self.scale)

        if self.aug_transformers is None:
            return load_face(transformer=self.transformer, **shared_kwargs), label

        augmented = [
            load_face(transformer=A.Compose([augmentation, self.transformer]), **shared_kwargs)
            for augmentation in self.aug_transformers
        ]
        return torch.stack(augmented), label

    def __len__(self):
        return len(self.df)

    def __getitem__(self, item):
        # Positional access: translate the position into the corresponding index key
        key = self.df.index[item]
        return self._get_face(key)
models/icpr2020dfdc/isplutils/data_siamese.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Video Face Manipulation Detection Through Ensemble of CNNs
3
+
4
+ Image and Sound Processing Lab - Politecnico di Milano
5
+
6
+ Nicolò Bonettini
7
+ Edoardo Daniele Cannas
8
+ Sara Mandelli
9
+ Luca Bondi
10
+ Paolo Bestagini
11
+ """
12
+ from typing import List
13
+
14
+ import albumentations as A
15
+ import pandas as pd
16
+ from albumentations.pytorch import ToTensorV2
17
+
18
+ from .data import FrameFaceIterableDataset, get_iterative_real_fake_idxs
19
+
20
+
21
class FrameFaceTripletIterableDataset(FrameFaceIterableDataset):
    """
    Iterable dataset yielding (anchor, positive, negative) face triplets,
    alternating fake-anchored and real-anchored triplets.
    """

    def __init__(self,
                 roots: List[str],
                 dfs: List[pd.DataFrame],
                 size: int,
                 scale: str,
                 num_triplets: int = -1,
                 transformer: A.BasicTransform = ToTensorV2(),
                 seed: int = None):
        """

        :param roots: List of root folders of the face images, one per DataFrame
        :param dfs: List of faces DataFrames with a 'label' column
        :param size: face size
        :param scale: scaling mode forwarded to the parent dataset
        :param num_triplets: requested number of triplets; three samples are requested
            from the parent dataset for each of them
        :param transformer: transformation applied to every face
        :param seed: base seed for the sampling
        """
        super().__init__(
            roots=roots,
            dfs=dfs,
            size=size,
            scale=scale,
            num_samples=num_triplets * 3,
            transformer=transformer,
            seed=seed
        )

        # Triplets come in couples (one fake-anchored, one real-anchored) of 6 samples:
        # round the parent's sample count down to a whole number of couples
        couples = self.num_samples // 6
        self.num_triplet_couples = couples
        self.num_triplets = couples * 2
        self.num_samples = self.num_triplets * 3

    def __len__(self):
        return self.num_triplets

    def __iter__(self):
        fake_pool, real_pool = get_iterative_real_fake_idxs(
            df_real=self.df_real,
            df_fake=self.df_fake,
            num_samples=self.num_samples,
            seed0=self.seed0
        )

        def next_face(pool):
            # Consume one key from the pool and keep only the face (the label is dropped)
            return self._get_face(pool.pop())[0]

        # Every round consumes three fake and three real samples
        while len(fake_pool) >= 3 and len(real_pool) >= 3:
            # anchor and positive are fake, negative is real
            yield next_face(fake_pool), next_face(fake_pool), next_face(real_pool)

            # anchor and positive are real, negative is fake
            yield next_face(real_pool), next_face(real_pool), next_face(fake_pool)
models/icpr2020dfdc/isplutils/split.py ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Dict, Tuple
2
+ """
3
+ Video Face Manipulation Detection Through Ensemble of CNNs
4
+
5
+ Image and Sound Processing Lab - Politecnico di Milano
6
+
7
+ Nicolò Bonettini
8
+ Edoardo Daniele Cannas
9
+ Sara Mandelli
10
+ Luca Bondi
11
+ Paolo Bestagini
12
+ """
13
+ import numpy as np
14
+ import pandas as pd
15
+
16
# Dataset identifiers accepted by the split helpers in this module.
# The 'ff-...-<N>fpv' variants are handled by get_split_df, which samples N rows per video.
available_datasets = [
    'dfdc-35-5-10',
    'ff-c23-720-140-140',
    'ff-c23-720-140-140-5fpv',
    'ff-c23-720-140-140-10fpv',
    'ff-c23-720-140-140-15fpv',
    'ff-c23-720-140-140-20fpv',
    'ff-c23-720-140-140-25fpv',
    'celebdf',  # just for convenience, not used in the original paper
]
26
+
27
+
28
def load_df(dfdc_df_path: str, ffpp_df_path: str, dfdc_faces_dir: str, ffpp_faces_dir: str, dataset: str) -> (pd.DataFrame, str):
    """
    Load the pickled faces DataFrame of a dataset together with its faces root folder.

    :param dfdc_df_path: path of the pickled DataFrame used for datasets whose name starts with 'dfdc'
    :param ffpp_df_path: path of the pickled DataFrame used for datasets whose name starts with 'ff-'
    :param dfdc_faces_dir: root folder returned for 'dfdc*' datasets
    :param ffpp_faces_dir: root folder returned for 'ff-*' datasets
    :param dataset: dataset name
    :return: (DataFrame, root folder)
    :raises NotImplementedError: if the dataset name matches neither prefix
    """
    if dataset.startswith('dfdc'):
        df_path, root = dfdc_df_path, dfdc_faces_dir
    elif dataset.startswith('ff-'):
        df_path, root = ffpp_df_path, ffpp_faces_dir
    else:
        raise NotImplementedError('Unknown dataset: {}'.format(dataset))
    return pd.read_pickle(df_path), root
38
+
39
+
40
def _rows_for_originals(df: pd.DataFrame, originals) -> pd.DataFrame:
    """Return rows whose 'original' is in `originals`, followed by rows whose 'video' is in `originals`."""
    return pd.concat((df[df['original'].isin(originals)], df[df['video'].isin(originals)]), axis=0)


def get_split_df(df: pd.DataFrame, dataset: str, split: str) -> pd.DataFrame:
    """
    Select the rows of `df` belonging to the requested split of a dataset.

    Random selections use a private ``np.random.RandomState(41)``. It draws the
    same values as seeding the global generator with 41, but the global NumPy
    random state is never touched, even when an exception is raised half-way.

    :param df: DataFrame of faces. Columns read: 'folder' for 'dfdc-35-5-10';
        'source', 'quality', 'video', 'original' for the 'ff-c23-720-140-140*' datasets;
        'label', 'test', 'video', 'original' for 'celebdf'
    :param dataset: dataset name
    :param split: 'train', 'val' or 'test'
    :return: DataFrame with the rows of the split
    :raises NotImplementedError: on unknown dataset or split
    """
    if dataset == 'dfdc-35-5-10':
        # Splits are defined by ranges of the 'folder' column
        folders_by_split = {'train': range(35), 'val': range(35, 40), 'test': range(40, 50)}
        if split not in folders_by_split:
            raise NotImplementedError('Unknown split: {}'.format(split))
        split_df = df[df['folder'].isin(folders_by_split[split])]
    elif dataset.startswith('ff-c23-720-140-140'):
        # Private generator for this selection only
        rng = np.random.RandomState(41)
        # Split on original videos
        crf = dataset.split('-')[1]
        random_youtube_videos = rng.permutation(
            df[(df['source'] == 'youtube') & (df['quality'] == crf)]['video'].unique())
        originals_by_split = {
            'train': random_youtube_videos[:720],
            'val': random_youtube_videos[720:720 + 140],
            'test': random_youtube_videos[720 + 140:],
        }
        if split not in originals_by_split:
            raise NotImplementedError('Unknown split: {}'.format(split))
        split_df = _rows_for_originals(df, originals_by_split[split])

        if dataset.endswith('fpv'):
            # The trailing '<N>fpv' token gives the number of rows to sample per video
            fpv = int(dataset.rsplit('-', 1)[1][:-3])
            idxs = [
                rng.choice(split_df[split_df['video'] == video].index, fpv, replace=False)
                for video in split_df['video'].unique()
            ]
            split_df = split_df.loc[np.concatenate(idxs)]
    elif dataset == 'celebdf':
        seed = 41
        num_real_train = 600

        # Private generator for this selection only
        rng = np.random.RandomState(seed)
        # Split on original videos: rows with label False that are not flagged as test
        random_train_val_real_videos = rng.permutation(
            df[(df['label'] == False) & (df['test'] == False)]['video'].unique())
        if split == 'train':
            split_df = _rows_for_originals(df, random_train_val_real_videos[:num_real_train])
        elif split == 'val':
            split_df = _rows_for_originals(df, random_train_val_real_videos[num_real_train:])
        elif split == 'test':
            split_df = df[df['test'] == True]
        else:
            raise NotImplementedError('Unknown split: {}'.format(split))
    else:
        raise NotImplementedError('Unknown dataset: {}'.format(dataset))
    return split_df
107
+
108
+
109
def make_splits(dfdc_df: str, ffpp_df: str, dfdc_dir: str, ffpp_dir: str, dbs: Dict[str, List[str]]) -> Dict[str, Dict[str, Tuple[pd.DataFrame, str]]]:
    """
    Build every requested split and return, for each, its DataFrame and faces root.

    Each dataset's full DataFrame is loaded once and reused for all the splits
    that mention it.

    :param dfdc_df: path to the DataFrame containing info on the faces extracted from the DFDC dataset
        with extract_faces.py
    :param ffpp_df: path to the DataFrame containing info on the faces extracted from the FF++ dataset
        with extract_faces.py
    :param dfdc_dir: path to the directory containing the faces extracted from the DFDC dataset
    :param ffpp_dir: path to the directory containing the faces extracted from the FF++ dataset
    :param dbs: {split_name: [dataset1, dataset2, ...]}
        Example:
        {'train': ['dfdc-35-5-10'], 'val': ['dfdc-35-5-10']}
    :return: {split_name: {dataset: (split DataFrame, faces root)}}
        Example:
        {'train': {'dfdc-35-5-10': (dfdc_train_df, 'path/to/dir/of/DFDC/faces')}}
    """
    loaded = {}  # dataset name -> (full DataFrame, root), filled lazily
    split_dict = {}
    for split_name, split_dbs in dbs.items():
        per_db = {}
        split_dict[split_name] = per_db
        for split_db in split_dbs:
            cached = loaded.get(split_db)
            if cached is None:
                cached = load_df(dfdc_df, ffpp_df, dfdc_dir, ffpp_dir, split_db)
                loaded[split_db] = cached
            full_df, root = cached
            per_db[split_db] = (get_split_df(df=full_df, dataset=split_db, split=split_name), root)

    return split_dict
models/icpr2020dfdc/isplutils/utils.py ADDED
@@ -0,0 +1,247 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Video Face Manipulation Detection Through Ensemble of CNNs
3
+
4
+ Image and Sound Processing Lab - Politecnico di Milano
5
+
6
+ Nicolò Bonettini
7
+ Edoardo Daniele Cannas
8
+ Sara Mandelli
9
+ Luca Bondi
10
+ Paolo Bestagini
11
+ """
12
+ from pprint import pprint
13
+ from typing import Iterable, List
14
+
15
+ import albumentations as A
16
+ import cv2
17
+ import numpy as np
18
+ import scipy
19
+ import torch
20
+ from PIL import Image
21
+ from albumentations.pytorch import ToTensorV2
22
+ from matplotlib import pyplot as plt
23
+ from torch import nn as nn
24
+ from torchvision import transforms
25
+
26
+
27
def extract_meta_av(path: str) -> (int, int, int):
    """
    Extract video height, width and number of frames to index the files, using PyAV.

    The container is always closed once the metadata has been read, so indexing
    many files does not accumulate open handles.

    :param path: path of the video file
    :return: (height, width, number of frames) of the first video stream,
        or (0, 0, 0) if the file cannot be read or has no video stream
    """
    import av
    try:
        video = av.open(path)
        try:
            video_stream = video.streams.video[0]
            return video_stream.height, video_stream.width, video_stream.frames
        finally:
            # Release the underlying file handle whether or not a stream was found
            video.close()
    except av.AVError as e:
        print('Error while reading file: {}'.format(path))
        print(e)
        return 0, 0, 0
    except IndexError as e:
        # Raised when the container has no video stream at index 0
        print('Error while processing file: {}'.format(path))
        print(e)
        return 0, 0, 0
46
+
47
+
48
def extract_meta_cv(path: str) -> (int, int, int):
    """
    Extract video height, width and number of frames to index the files, using OpenCV.

    The capture object is released as soon as the properties have been read.

    :param path: path of the video file
    :return: (height, width, number of frames), or (0, 0, 0) if an exception is raised
    """
    try:
        vid = cv2.VideoCapture(path)
        try:
            num_frames = int(vid.get(cv2.CAP_PROP_FRAME_COUNT))
            height = int(vid.get(cv2.CAP_PROP_FRAME_HEIGHT))
            width = int(vid.get(cv2.CAP_PROP_FRAME_WIDTH))
        finally:
            # Free the decoder/file handle even if reading a property fails
            vid.release()
        return height, width, num_frames
    except Exception as e:
        print('Error while reading file: {}'.format(path))
        print(e)
        return 0, 0, 0
64
+
65
+
66
def adapt_bb(frame_height: int, frame_width: int, bb_height: int, bb_width: int, left: int, top: int, right: int,
             bottom: int) -> (
        int, int, int, int):
    """
    Re-center a box of the requested size on an existing bounding box, clipped to the frame.

    The top-left corner is clamped at 0 and the bottom-right corner at the frame
    size; the box is not shifted back inside the frame, so a clipped box can be
    smaller than (bb_height, bb_width).

    :return: (new_left, new_top, new_right, new_bottom)
    """
    center_x = (left + right) // 2
    center_y = (bottom + top) // 2

    # Top-left corner: half a box before the center, never negative
    new_left = max(center_x - bb_width // 2, 0)
    new_top = max(center_y - bb_height // 2, 0)

    # Bottom-right corner: one box after the (possibly clamped) top-left corner
    new_right = min(new_left + bb_width, frame_width)
    new_bottom = min(new_top + bb_height, frame_height)

    return new_left, new_top, new_right, new_bottom
76
+
77
+
78
def extract_bb(frame: Image.Image, bb: Iterable, scale: str, size: int) -> Image.Image:
    """
    Cut a face out of a frame following the given bounding box and scale policy.

    :param frame: Entire frame
    :param bb: Bounding box (left,top,right,bottom) in the reference system of the frame
    :param scale: "scale" to crop a square whose side is the larger of the face height and width,
                  then resize it to size x size
                  "crop" to crop a fixed size x size square around the face center
                  "tight" to crop the face at the bounding box with no scaling
    :param size: size of the face
    :return: the extracted face
    :raises ValueError: if scale is not one of the three policies
    """
    left, top, right, bottom = bb

    if scale == "scale":
        width = int(right) - int(left)
        height = int(bottom) - int(top)
        if width > 0 and height > 0:
            # The smaller ratio belongs to the longer side of the box
            ratio = min(size / height, size / width)
        else:
            ratio = 1.
        side = int(size / ratio)
        box = adapt_bb(frame.height, frame.width, side, side, left, top, right, bottom)
        return frame.crop(box).resize((size, size), Image.BILINEAR)

    if scale == "crop":
        # Fixed-size window centered on the bounding box
        box = adapt_bb(frame.height, frame.width, size, size, left, top, right, bottom)
        return frame.crop(box)

    if scale == "tight":
        box = adapt_bb(frame.height, frame.width, bottom - top, right - left, left, top, right, bottom)
        return frame.crop(box)

    raise ValueError('Unknown scale value: {}'.format(scale))
112
+
113
+
114
def showimage(img_tensor: torch.Tensor):
    """
    Display a normalized image tensor in a new matplotlib figure.

    The tensor is first de-normalized: it is multiplied channel-wise by
    (0.229, 0.224, 0.225) and then (0.485, 0.456, 0.406) is added, i.e. the
    inverse of a Normalize with that mean and std.

    :param img_tensor: normalized image tensor
    """
    undo_std = transforms.Normalize(mean=[0, 0, 0, ], std=[1 / 0.229, 1 / 0.224, 1 / 0.225])
    undo_mean = transforms.Normalize(mean=[-0.485, -0.456, -0.406], std=[1, 1, 1])
    to_pil = transforms.Compose([undo_std, undo_mean, transforms.ToPILImage()])

    plt.figure()
    pil_image = to_pil(img_tensor)
    plt.imshow(pil_image)
    plt.show()
123
+
124
+
125
def make_train_tag(net_class: nn.Module,
                   face_policy: str,
                   patch_size: int,
                   traindb: List[str],
                   seed: int,
                   suffix: str,
                   debug: bool,
                   ):
    """
    Build (and print) the tag string identifying a training run.

    The tag is the '_'-joined list of 'key-value' pairs for net, traindb, face,
    size and seed, prefixed with 'debug_' when `debug` is set and followed by
    '_<suffix>' when `suffix` is not None.

    :param net_class: network class; only its ``__name__`` is used
    :param face_policy: face extraction policy
    :param patch_size: patch size
    :param traindb: training dataset names, joined with '-'
    :param seed: random seed
    :param suffix: optional suffix appended to the tag
    :param debug: whether to prepend 'debug_'
    :return: the tag
    """
    # Training parameters, in the order they appear in the tag
    tag_params = {
        'net': net_class.__name__,
        'traindb': '-'.join(traindb),
        'face': face_policy,
        'size': patch_size,
        'seed': seed,
    }
    print('Parameters')
    pprint(tag_params)

    pieces = []
    for key, value in tag_params.items():
        pieces.append('-'.join([key, str(value)]))

    prefix = 'debug_' if debug else ''
    tag = prefix + '_'.join(pieces)
    if suffix is not None:
        tag += '_' + suffix
    print('Tag: {:s}'.format(tag))
    return tag
148
+
149
+
150
def get_transformer(face_policy: str, patch_size: int, net_normalizer: transforms.Normalize, train: bool):
    """
    Compose the albumentations pipeline applied to every loaded face.

    The pipeline is: policy-specific loading transforms, then (train only) a
    50%-probability half-resolution downscale and the augmentation block, then
    normalization with the network statistics and conversion to tensor.

    :param face_policy: 'scale' or 'tight'
    :param patch_size: side of the square patch produced by the loading transforms
    :param net_normalizer: object whose ``mean`` and ``std`` attributes feed the final normalization
    :param train: whether to include the train-time downscale and augmentations
    :return: the composed transformation
    :raises ValueError: if face_policy is neither 'scale' nor 'tight'
    """
    if face_policy == 'scale':
        # Pad up to patch_size if needed, then resize to exactly patch_size x patch_size
        loading_transformations = [
            A.PadIfNeeded(min_height=patch_size, min_width=patch_size,
                          border_mode=cv2.BORDER_CONSTANT, value=0, always_apply=True),
            A.Resize(height=patch_size, width=patch_size, always_apply=True),
        ]
    elif face_policy == 'tight':
        # Shrink the longest side to patch_size, then pad the rest with zeros
        loading_transformations = [
            A.LongestMaxSize(max_size=patch_size, always_apply=True),
            A.PadIfNeeded(min_height=patch_size, min_width=patch_size,
                          border_mode=cv2.BORDER_CONSTANT, value=0, always_apply=True),
        ]
    else:
        raise ValueError('Unknown value for face_policy: {}'.format(face_policy))

    if train:
        # Identical for both face policies
        downsample_train_transformations = [
            A.Downscale(scale_max=0.5, scale_min=0.5, p=0.5),  # replaces scaled dataset
        ]
        aug_transformations = [
            A.Compose([
                A.HorizontalFlip(),
                A.OneOf([
                    A.RandomBrightnessContrast(),
                    A.HueSaturationValue(hue_shift_limit=10, sat_shift_limit=30, val_shift_limit=20),
                ]),
                A.OneOf([
                    A.ISONoise(),
                    A.IAAAdditiveGaussianNoise(scale=(0.01 * 255, 0.03 * 255)),
                ]),
                A.Downscale(scale_min=0.7, scale_max=0.9, interpolation=cv2.INTER_LINEAR),
                A.ImageCompression(quality_lower=50, quality_upper=99),
            ], )
        ]
    else:
        downsample_train_transformations = []
        aug_transformations = []

    # Common final transformations
    final_transformations = [
        A.Normalize(mean=net_normalizer.mean, std=net_normalizer.std, ),
        ToTensorV2(),
    ]

    pipeline = (loading_transformations
                + downsample_train_transformations
                + aug_transformations
                + final_transformations)
    return A.Compose(pipeline)
208
+
209
+
210
def aggregate(x, deadzone: float, pre_mult: float, policy: str, post_mult: float, clipmargin: float, params=None):
    """
    Aggregate an array of scores into a single value clipped to [clipmargin, 1 - clipmargin].

    The input array is copied, never modified. The caller's `params` dict is
    only read, so passing the same dict to several calls gives the same result
    every time.

    :param x: numpy array of scores
    :param deadzone: if > 0, scores with absolute value <= deadzone are discarded before aggregating;
        if nothing is left a single 0 is aggregated
    :param pre_mult: multiplier applied before the sigmoid (or before the mean for 'voting')
    :param policy: one of 'mean', 'sigmean', 'meanp', 'median', 'sigmedian', 'maxabs', 'avgvoting', 'voting'
    :param post_mult: multiplier applied around 0.5 after aggregation
    :param clipmargin: margin kept from 0 and 1 in the returned value
    :param params: optional dict of policy parameters; 'meanp' reads 'p' (default 3)
    :return: the aggregated, clipped score
    :raises NotImplementedError: if policy is unknown
    """
    # Import the submodule explicitly: a bare `import scipy` does not guarantee
    # that `scipy.special` is loaded on every SciPy version.
    from scipy.special import expit

    if params is None:
        params = {}

    x = x.copy()
    if deadzone > 0:
        x = x[(x > deadzone) | (x < -deadzone)]
    if len(x) == 0:
        x = np.asarray([0, ])
    if policy == 'mean':
        x = np.mean(x)
        x = expit(x * pre_mult)
        x = (x - 0.5) * post_mult + 0.5
    elif policy == 'sigmean':
        x = expit(x * pre_mult).mean()
        x = (x - 0.5) * post_mult + 0.5
    elif policy == 'meanp':
        # Read without popping so the caller's dict is left intact
        pow_coeff = params.get('p', 3)
        # Signed power mean: raise magnitudes to p, average, take the signed p-th root
        x = np.mean(np.sign(x) * (np.abs(x) ** pow_coeff))
        x = np.sign(x) * (np.abs(x) ** (1 / pow_coeff))
        x = expit(x * pre_mult)
        x = (x - 0.5) * post_mult + 0.5
    elif policy == 'median':
        x = expit(np.median(x) * pre_mult)
        x = (x - 0.5) * post_mult + 0.5
    elif policy == 'sigmedian':
        x = np.median(expit(x * pre_mult))
        x = (x - 0.5) * post_mult + 0.5
    elif policy == 'maxabs':
        # Keep the score with the largest magnitude (the maximum wins ties)
        x = np.min(x) if abs(np.min(x)) > abs(np.max(x)) else np.max(x)
        x = expit(x * pre_mult)
        x = (x - 0.5) * post_mult + 0.5
    elif policy == 'avgvoting':
        x = np.mean(np.sign(x))
        x = (x * post_mult + 1) / 2
    elif policy == 'voting':
        x = np.sign(np.mean(x * pre_mult))
        x = (x - 0.5) * post_mult + 0.5
    else:
        raise NotImplementedError()
    return np.clip(x, clipmargin, 1 - clipmargin)
models/icpr2020dfdc/test_model.py ADDED
@@ -0,0 +1,270 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Video Face Manipulation Detection Through Ensemble of CNNs
3
+
4
+ Image and Sound Processing Lab - Politecnico di Milano
5
+
6
+ Nicolò Bonettini
7
+ Edoardo Daniele Cannas
8
+ Sara Mandelli
9
+ Luca Bondi
10
+ Paolo Bestagini
11
+ """
12
+ import argparse
13
+ import gc
14
+ from collections import OrderedDict
15
+ from pathlib import Path
16
+
17
+ import albumentations as A
18
+ import matplotlib.pyplot as plt
19
+ import numpy as np
20
+ import pandas as pd
21
+ import torch
22
+ import torch.nn as nn
23
+ from torch.utils.data import DataLoader
24
+ from tqdm import tqdm
25
+
26
+ from architectures import fornet
27
+ from architectures.fornet import FeatureExtractor
28
+ from isplutils import utils, split
29
+ from isplutils.data import FrameFaceDatasetTest
30
+
31
+
32
+ def main():
33
+ # Args
34
+ parser = argparse.ArgumentParser()
35
+
36
+ parser.add_argument('--testsets', type=str, help='Testing datasets', nargs='+', choices=split.available_datasets,
37
+ required=True)
38
+ parser.add_argument('--testsplits', type=str, help='Test split', nargs='+', default=['val', 'test'],
39
+ choices=['train', 'val', 'test'])
40
+ parser.add_argument('--dfdc_faces_df_path', type=str, action='store',
41
+ help='Path to the Pandas Dataframe obtained from extract_faces.py on the DFDC dataset. '
42
+ 'Required for training/validating on the DFDC dataset.')
43
+ parser.add_argument('--dfdc_faces_dir', type=str, action='store',
44
+ help='Path to the directory containing the faces extracted from the DFDC dataset. '
45
+ 'Required for training/validating on the DFDC dataset.')
46
+ parser.add_argument('--ffpp_faces_df_path', type=str, action='store',
47
+ help='Path to the Pandas Dataframe obtained from extract_faces.py on the FF++ dataset. '
48
+ 'Required for training/validating on the FF++ dataset.')
49
+ parser.add_argument('--ffpp_faces_dir', type=str, action='store',
50
+ help='Path to the directory containing the faces extracted from the FF++ dataset. '
51
+ 'Required for training/validating on the FF++ dataset.')
52
+
53
+ # Specify trained model path
54
+ parser.add_argument('--model_path', type=Path, help='Full path of the trained model', required=True)
55
+
56
+ # Common params
57
+ parser.add_argument('--batch', type=int, help='Batch size to fit in GPU memory', default=128)
58
+
59
+ parser.add_argument('--workers', type=int, help='Num workers for data loaders', default=6)
60
+ parser.add_argument('--device', type=int, help='GPU id', default=0)
61
+
62
+ parser.add_argument('--debug', action='store_true', help='Debug flag', )
63
+ parser.add_argument('--num_video', type=int, help='Number of real-fake videos to test')
64
+ parser.add_argument('--results_dir', type=Path, help='Output folder',
65
+ default='results/')
66
+
67
+ parser.add_argument('--override', action='store_true', help='Override existing results', )
68
+
69
+ args = parser.parse_args()
70
+
71
+ device = torch.device('cuda:{}'.format(args.device)) if torch.cuda.is_available() else torch.device('cpu')
72
+ num_workers: int = args.workers
73
+ batch_size: int = args.batch
74
+ max_num_videos_per_label: int = args.num_video # number of real-fake videos to test
75
+ model_path: Path = args.model_path
76
+ results_dir: Path = args.results_dir
77
+ debug: bool = args.debug
78
+ override: bool = args.override
79
+ test_sets = args.testsets
80
+ test_splits = args.testsplits
81
+ dfdc_df_path = args.dfdc_faces_df_path
82
+ ffpp_df_path = args.ffpp_faces_df_path
83
+ dfdc_faces_dir = args.dfdc_faces_dir
84
+ ffpp_faces_dir = args.ffpp_faces_dir
85
+
86
+ # get arguments from the model path
87
+ face_policy = str(model_path).split('face-')[1].split('_')[0]
88
+ patch_size = int(str(model_path).split('size-')[1].split('_')[0])
89
+ net_name = str(model_path).split('net-')[1].split('_')[0]
90
+ model_name = '_'.join(model_path.with_suffix('').parts[-2:])
91
+
92
+ # Load net
93
+ net_class = getattr(fornet, net_name)
94
+
95
+ # load model
96
+ print('Loading model...')
97
+ state_tmp = torch.load(model_path, map_location='cpu')
98
+ if 'net' not in state_tmp.keys():
99
+ state = OrderedDict({'net': OrderedDict()})
100
+ [state['net'].update({'model.{}'.format(k): v}) for k, v in state_tmp.items()]
101
+ else:
102
+ state = state_tmp
103
+ net: FeatureExtractor = net_class().eval().to(device)
104
+
105
+ incomp_keys = net.load_state_dict(state['net'], strict=True)
106
+ print(incomp_keys)
107
+ print('Model loaded!')
108
+
109
+ # val loss per-frame
110
+ criterion = nn.BCEWithLogitsLoss(reduction='none')
111
+
112
+ # Define data transformers
113
+ test_transformer = utils.get_transformer(face_policy, patch_size, net.get_normalizer(), train=False)
114
+
115
+ # datasets and dataloaders (from train_binclass.py)
116
+ print('Loading data...')
117
+ # Check if paths for DFDC and FF++ extracted faces and DataFrames are provided
118
+ for dataset in test_sets:
119
+ if dataset.split('-')[0] == 'dfdc' and (dfdc_df_path is None or dfdc_faces_dir is None):
120
+ raise RuntimeError('Specify DataFrame and directory for DFDC faces for testing!')
121
+ elif dataset.split('-')[0] == 'ff' and (ffpp_df_path is None or ffpp_faces_dir is None):
122
+ raise RuntimeError('Specify DataFrame and directory for FF++ faces for testing!')
123
+ splits = split.make_splits(dfdc_df=dfdc_df_path, ffpp_df=ffpp_df_path, dfdc_dir=dfdc_faces_dir,
124
+ ffpp_dir=ffpp_faces_dir, dbs={'train': test_sets, 'val': test_sets, 'test': test_sets})
125
+ train_dfs = [splits['train'][db][0] for db in splits['train']]
126
+ train_roots = [splits['train'][db][1] for db in splits['train']]
127
+ val_roots = [splits['val'][db][1] for db in splits['val']]
128
+ val_dfs = [splits['val'][db][0] for db in splits['val']]
129
+ test_dfs = [splits['test'][db][0] for db in splits['test']]
130
+ test_roots = [splits['test'][db][1] for db in splits['test']]
131
+
132
+ # Output paths
133
+ out_folder = results_dir.joinpath(model_name)
134
+ out_folder.mkdir(mode=0o775, parents=True, exist_ok=True)
135
+
136
+ # Samples selection
137
+ if max_num_videos_per_label and max_num_videos_per_label > 0:
138
+ dfs_out_train = [select_videos(df, max_num_videos_per_label) for df in train_dfs]
139
+ dfs_out_val = [select_videos(df, max_num_videos_per_label) for df in val_dfs]
140
+ dfs_out_test = [select_videos(df, max_num_videos_per_label) for df in test_dfs]
141
+ else:
142
+ dfs_out_train = train_dfs
143
+ dfs_out_val = val_dfs
144
+ dfs_out_test = test_dfs
145
+
146
+ # Extractions list
147
+ extr_list = []
148
+ # Append train and validation set first
149
+ if 'train' in test_splits:
150
+ for idx, dataset in enumerate(test_sets):
151
+ extr_list.append(
152
+ (dfs_out_train[idx], out_folder.joinpath(dataset + '_train.pkl'), train_roots[idx], dataset + ' TRAIN')
153
+ )
154
+ if 'val' in test_splits:
155
+ for idx, dataset in enumerate(test_sets):
156
+ extr_list.append(
157
+ (dfs_out_val[idx], out_folder.joinpath(dataset + '_val.pkl'), val_roots[idx], dataset + ' VAL')
158
+ )
159
+ if 'test' in test_splits:
160
+ for idx, dataset in enumerate(test_sets):
161
+ extr_list.append(
162
+ (dfs_out_test[idx], out_folder.joinpath(dataset + '_test.pkl'), test_roots[idx], dataset + ' TEST')
163
+ )
164
+
165
+ for df, df_path, df_root, tag in extr_list:
166
+ if override or not df_path.exists():
167
+ print('\n##### PREDICT VIDEOS FROM {} #####'.format(tag))
168
+ print('Real frames: {}'.format(sum(df['label'] == False)))
169
+ print('Fake frames: {}'.format(sum(df['label'] == True)))
170
+ print('Real videos: {}'.format(df[df['label'] == False]['video'].nunique()))
171
+ print('Fake videos: {}'.format(df[df['label'] == True]['video'].nunique()))
172
+ dataset_out = process_dataset(root=df_root, df=df, net=net, criterion=criterion,
173
+ patch_size=patch_size,
174
+ face_policy=face_policy, transformer=test_transformer,
175
+ batch_size=batch_size,
176
+ num_workers=num_workers, device=device, )
177
+ df['score'] = dataset_out['score'].astype(np.float32)
178
+ df['loss'] = dataset_out['loss'].astype(np.float32)
179
+ print('Saving results to: {}'.format(df_path))
180
+ df.to_pickle(str(df_path))
181
+
182
+ if debug:
183
+ plt.figure()
184
+ plt.title(tag)
185
+ plt.hist(df[df.label == True].score, bins=100, alpha=0.6, label='FAKE frames')
186
+ plt.hist(df[df.label == False].score, bins=100, alpha=0.6, label='REAL frames')
187
+ plt.legend()
188
+
189
+ del (dataset_out)
190
+ del (df)
191
+ gc.collect()
192
+
193
+ if debug:
194
+ plt.show()
195
+
196
+ print('Completed!')
197
+
198
+
199
def process_dataset(df: pd.DataFrame,
                    root: str,
                    net: FeatureExtractor,
                    criterion,
                    patch_size: int,
                    face_policy: str,
                    transformer: A.BasicTransform,
                    batch_size: int,
                    num_workers: int,
                    device: torch.device,
                    ) -> dict:
    """
    Run the network on every frame listed in `df` and collect per-frame score and loss.

    Frames are read in DataFrame order (no shuffling, no dropped batch), so the
    i-th entry of each output array is written for the i-th row of `df`.

    :param df: DataFrame of frames to process
    :param root: root folder of the cached faces
    :param net: network, called on each batch of images
    :param criterion: loss called as criterion(net_output, labels); column 0 of its output is stored
    :param patch_size: face size passed to the dataset
    :param face_policy: face extraction policy passed to the dataset as `scale`
    :param transformer: transformation passed to the dataset
    :param batch_size: loader batch size
    :param num_workers: loader workers
    :param device: torch device, or an int/str accepted by ``torch.device``
    :return: {'score': array of length len(df), 'loss': array of length len(df)}
    """
    if isinstance(device, (int, str)):
        device = torch.device(device)

    dataset = FrameFaceDatasetTest(
        root=root,
        df=df,
        size=patch_size,
        scale=face_policy,
        transformer=transformer,
    )

    # Preallocate
    num_frames = len(df)
    score = np.zeros(num_frames)
    loss = np.zeros(num_frames)

    loader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False, drop_last=False)
    offset = 0
    with torch.no_grad():
        for batch_data in tqdm(loader):
            images = batch_data[0].to(device)
            labels = batch_data[1].to(device)
            stop = offset + len(images)
            logits = net(images)
            frame_loss = criterion(logits, labels)
            # Keep column 0 of both outputs
            score[offset:stop] = logits.cpu().numpy()[:, 0]
            loss[offset:stop] = frame_loss.cpu().numpy()[:, 0]
            offset = stop

    return {'score': score, 'loss': loss}
240
+
241
+
242
def select_videos(df: pd.DataFrame, max_videos_per_label: int) -> pd.DataFrame:
    """
    Select up to a maximum number of videos per label.

    The selection uses a private ``np.random.RandomState(42)``. It draws the
    same videos as seeding the global generator with 42, but the global NumPy
    random state is never modified, even if an exception is raised.

    :param df: DataFrame of frames. Required columns: 'video','label'
    :param max_videos_per_label: maximum number of real and fake videos
    :return: DataFrame of selected frames (fake frames first, then real frames)
    :raises ValueError: from ``pd.concat(verify_integrity=True)`` if the resulting index has duplicates
    """
    # Private generator for this selection only
    rng = np.random.RandomState(42)

    # Fake videos are drawn first, then real ones, from the same random stream
    df_fake = df[df.label == True]
    fake_videos = df_fake['video'].unique()
    selected_fake_videos = rng.choice(fake_videos, min(max_videos_per_label, len(fake_videos)), replace=False)
    df_selected_fake_frames = df_fake[df_fake['video'].isin(selected_fake_videos)]

    df_real = df[df.label == False]
    real_videos = df_real['video'].unique()
    selected_real_videos = rng.choice(real_videos, min(max_videos_per_label, len(real_videos)), replace=False)
    df_selected_real_frames = df_real[df_real['video'].isin(selected_real_videos)]

    return pd.concat((df_selected_fake_frames, df_selected_real_frames), axis=0, verify_integrity=True).copy()
267
+
268
+
269
+ if __name__ == '__main__':
270
+ main()
models/icpr2020dfdc/train_binclass.py ADDED
@@ -0,0 +1,460 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Video Face Manipulation Detection Through Ensemble of CNNs
3
+
4
+ Image and Sound Processing Lab - Politecnico di Milano
5
+
6
+ Nicolò Bonettini
7
+ Edoardo Daniele Cannas
8
+ Sara Mandelli
9
+ Luca Bondi
10
+ Paolo Bestagini
11
+ """
12
+ import argparse
13
+ import os
14
+ import shutil
15
+ import warnings
16
+
17
+ import albumentations as A
18
+ import numpy as np
19
+ import pandas as pd
20
+ import torch
21
+ import torch.multiprocessing
22
+ from torchvision.transforms import ToPILImage, ToTensor
23
+
24
+ from isplutils import utils, split
25
+
26
+ torch.multiprocessing.set_sharing_strategy('file_system')
27
+ import torch.nn as nn
28
+ from albumentations.pytorch import ToTensorV2
29
+ from sklearn.metrics import roc_auc_score
30
+ from tensorboardX import SummaryWriter
31
+ from torch import optim
32
+ from torch.utils.data import DataLoader
33
+ from tqdm import tqdm
34
+ from PIL import ImageChops, Image
35
+
36
+ from architectures import fornet
37
+ from isplutils.data import FrameFaceIterableDataset, load_face
38
+
39
+
40
+ def main():
41
+ # Args
42
+ parser = argparse.ArgumentParser()
43
+ parser.add_argument('--net', type=str, help='Net model class', required=True)
44
+ parser.add_argument('--traindb', type=str, help='Training datasets', nargs='+', choices=split.available_datasets,
45
+ required=True)
46
+ parser.add_argument('--valdb', type=str, help='Validation datasets', nargs='+', choices=split.available_datasets,
47
+ required=True)
48
+ parser.add_argument('--dfdc_faces_df_path', type=str, action='store',
49
+ help='Path to the Pandas Dataframe obtained from extract_faces.py on the DFDC dataset. '
50
+ 'Required for training/validating on the DFDC dataset.')
51
+ parser.add_argument('--dfdc_faces_dir', type=str, action='store',
52
+ help='Path to the directory containing the faces extracted from the DFDC dataset. '
53
+ 'Required for training/validating on the DFDC dataset.')
54
+ parser.add_argument('--ffpp_faces_df_path', type=str, action='store',
55
+ help='Path to the Pandas Dataframe obtained from extract_faces.py on the FF++ dataset. '
56
+ 'Required for training/validating on the FF++ dataset.')
57
+ parser.add_argument('--ffpp_faces_dir', type=str, action='store',
58
+ help='Path to the directory containing the faces extracted from the FF++ dataset. '
59
+ 'Required for training/validating on the FF++ dataset.')
60
+ parser.add_argument('--face', type=str, help='Face crop or scale', required=True,
61
+ choices=['scale', 'tight'])
62
+ parser.add_argument('--size', type=int, help='Train patch size', required=True)
63
+
64
+ parser.add_argument('--batch', type=int, help='Batch size to fit in GPU memory', default=32)
65
+ parser.add_argument('--lr', type=float, default=1e-5, help='Learning rate')
66
+ parser.add_argument('--valint', type=int, help='Validation interval (iterations)', default=500)
67
+ parser.add_argument('--patience', type=int, help='Patience before dropping the LR [validation intervals]',
68
+ default=10)
69
+ parser.add_argument('--maxiter', type=int, help='Maximum number of iterations', default=20000)
70
+ parser.add_argument('--init', type=str, help='Weight initialization file')
71
+ parser.add_argument('--scratch', action='store_true', help='Train from scratch')
72
+
73
+ parser.add_argument('--trainsamples', type=int, help='Limit the number of train samples per epoch', default=-1)
74
+ parser.add_argument('--valsamples', type=int, help='Limit the number of validation samples per epoch',
75
+ default=6000)
76
+
77
+ parser.add_argument('--logint', type=int, help='Training log interval (iterations)', default=100)
78
+ parser.add_argument('--workers', type=int, help='Num workers for data loaders', default=6)
79
+ parser.add_argument('--device', type=int, help='GPU device id', default=0)
80
+ parser.add_argument('--seed', type=int, help='Random seed', default=0)
81
+
82
+ parser.add_argument('--debug', action='store_true', help='Activate debug')
83
+ parser.add_argument('--suffix', type=str, help='Suffix to default tag')
84
+
85
+ parser.add_argument('--attention', action='store_true',
86
+ help='Enable Tensorboard log of attention masks')
87
+ parser.add_argument('--log_dir', type=str, help='Directory for saving the training logs',
88
+ default='runs/binclass/')
89
+ parser.add_argument('--models_dir', type=str, help='Directory for saving the models weights',
90
+ default='weights/binclass/')
91
+
92
+ args = parser.parse_args()
93
+
94
+ # Parse arguments
95
+ net_class = getattr(fornet, args.net)
96
+ train_datasets = args.traindb
97
+ val_datasets = args.valdb
98
+ dfdc_df_path = args.dfdc_faces_df_path
99
+ ffpp_df_path = args.ffpp_faces_df_path
100
+ dfdc_faces_dir = args.dfdc_faces_dir
101
+ ffpp_faces_dir = args.ffpp_faces_dir
102
+ face_policy = args.face
103
+ face_size = args.size
104
+
105
+ batch_size = args.batch
106
+ initial_lr = args.lr
107
+ validation_interval = args.valint
108
+ patience = args.patience
109
+ max_num_iterations = args.maxiter
110
+ initial_model = args.init
111
+ train_from_scratch = args.scratch
112
+
113
+ max_train_samples = args.trainsamples
114
+ max_val_samples = args.valsamples
115
+
116
+ log_interval = args.logint
117
+ num_workers = args.workers
118
+ device = torch.device('cuda:{:d}'.format(args.device)) if torch.cuda.is_available() else torch.device('cpu')
119
+ seed = args.seed
120
+
121
+ debug = args.debug
122
+ suffix = args.suffix
123
+
124
+ enable_attention = args.attention
125
+
126
+ weights_folder = args.models_dir
127
+ logs_folder = args.log_dir
128
+
129
+ # Random initialization
130
+ np.random.seed(seed)
131
+ torch.random.manual_seed(seed)
132
+
133
+ # Load net
134
+ net: nn.Module = net_class().to(device)
135
+
136
+ # Loss and optimizers
137
+ criterion = nn.BCEWithLogitsLoss()
138
+
139
+ min_lr = initial_lr * 1e-5
140
+ optimizer = optim.Adam(net.get_trainable_parameters(), lr=initial_lr)
141
+ lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(
142
+ optimizer=optimizer,
143
+ mode='min',
144
+ factor=0.1,
145
+ patience=patience,
146
+ cooldown=2 * patience,
147
+ min_lr=min_lr,
148
+ )
149
+
150
+ tag = utils.make_train_tag(net_class=net_class,
151
+ traindb=train_datasets,
152
+ face_policy=face_policy,
153
+ patch_size=face_size,
154
+ seed=seed,
155
+ suffix=suffix,
156
+ debug=debug,
157
+ )
158
+
159
+ # Model checkpoint paths
160
+ bestval_path = os.path.join(weights_folder, tag, 'bestval.pth')
161
+ last_path = os.path.join(weights_folder, tag, 'last.pth')
162
+ periodic_path = os.path.join(weights_folder, tag, 'it{:06d}.pth')
163
+
164
+ os.makedirs(os.path.join(weights_folder, tag), exist_ok=True)
165
+
166
+ # Load model
167
+ val_loss = min_val_loss = 10
168
+ epoch = iteration = 0
169
+ net_state = None
170
+ opt_state = None
171
+ if initial_model is not None:
172
+ # If given load initial model
173
+ print('Loading model form: {}'.format(initial_model))
174
+ state = torch.load(initial_model, map_location='cpu')
175
+ net_state = state['net']
176
+ elif not train_from_scratch and os.path.exists(last_path):
177
+ print('Loading model form: {}'.format(last_path))
178
+ state = torch.load(last_path, map_location='cpu')
179
+ net_state = state['net']
180
+ opt_state = state['opt']
181
+ iteration = state['iteration'] + 1
182
+ epoch = state['epoch']
183
+ if not train_from_scratch and os.path.exists(bestval_path):
184
+ state = torch.load(bestval_path, map_location='cpu')
185
+ min_val_loss = state['val_loss']
186
+ if net_state is not None:
187
+ incomp_keys = net.load_state_dict(net_state, strict=False)
188
+ print(incomp_keys)
189
+ if opt_state is not None:
190
+ for param_group in opt_state['param_groups']:
191
+ param_group['lr'] = initial_lr
192
+ optimizer.load_state_dict(opt_state)
193
+
194
+ # Initialize Tensorboard
195
+ logdir = os.path.join(logs_folder, tag)
196
+ if iteration == 0:
197
+ # If training from scratch or initialization remove history if exists
198
+ shutil.rmtree(logdir, ignore_errors=True)
199
+
200
+ # TensorboardX instance
201
+ tb = SummaryWriter(logdir=logdir)
202
+ if iteration == 0:
203
+ dummy = torch.randn((1, 3, face_size, face_size), device=device)
204
+ dummy = dummy.to(device)
205
+ with warnings.catch_warnings():
206
+ warnings.simplefilter("ignore")
207
+ tb.add_graph(net, [dummy, ], verbose=False)
208
+
209
+ transformer = utils.get_transformer(face_policy=face_policy, patch_size=face_size,
210
+ net_normalizer=net.get_normalizer(), train=True)
211
+
212
+ # Datasets and data loaders
213
+ print('Loading data')
214
+ # Check if paths for DFDC and FF++ extracted faces and DataFrames are provided
215
+ for dataset in train_datasets:
216
+ if dataset.split('-')[0] == 'dfdc' and (dfdc_df_path is None or dfdc_faces_dir is None):
217
+ raise RuntimeError('Specify DataFrame and directory for DFDC faces for training!')
218
+ elif dataset.split('-')[0] == 'ff' and (ffpp_df_path is None or ffpp_faces_dir is None):
219
+ raise RuntimeError('Specify DataFrame and directory for FF++ faces for training!')
220
+ for dataset in val_datasets:
221
+ if dataset.split('-')[0] == 'dfdc' and (dfdc_df_path is None or dfdc_faces_dir is None):
222
+ raise RuntimeError('Specify DataFrame and directory for DFDC faces for validation!')
223
+ elif dataset.split('-')[0] == 'ff' and (ffpp_df_path is None or ffpp_faces_dir is None):
224
+ raise RuntimeError('Specify DataFrame and directory for FF++ faces for validation!')
225
+ # Load splits with the make_splits function
226
+ splits = split.make_splits(dfdc_df=dfdc_df_path, ffpp_df=ffpp_df_path, dfdc_dir=dfdc_faces_dir, ffpp_dir=ffpp_faces_dir,
227
+ dbs={'train': train_datasets, 'val': val_datasets})
228
+ train_dfs = [splits['train'][db][0] for db in splits['train']]
229
+ train_roots = [splits['train'][db][1] for db in splits['train']]
230
+ val_roots = [splits['val'][db][1] for db in splits['val']]
231
+ val_dfs = [splits['val'][db][0] for db in splits['val']]
232
+
233
+ train_dataset = FrameFaceIterableDataset(roots=train_roots,
234
+ dfs=train_dfs,
235
+ scale=face_policy,
236
+ num_samples=max_train_samples,
237
+ transformer=transformer,
238
+ size=face_size,
239
+ )
240
+
241
+ val_dataset = FrameFaceIterableDataset(roots=val_roots,
242
+ dfs=val_dfs,
243
+ scale=face_policy,
244
+ num_samples=max_val_samples,
245
+ transformer=transformer,
246
+ size=face_size,
247
+ )
248
+
249
+ train_loader = DataLoader(train_dataset, num_workers=num_workers, batch_size=batch_size, )
250
+
251
+ val_loader = DataLoader(val_dataset, num_workers=num_workers, batch_size=batch_size, )
252
+
253
+ print('Training samples: {}'.format(len(train_dataset)))
254
+ print('Validation samples: {}'.format(len(val_dataset)))
255
+
256
+ if len(train_dataset) == 0:
257
+ print('No training samples. Halt.')
258
+ return
259
+
260
+ if len(val_dataset) == 0:
261
+ print('No validation samples. Halt.')
262
+ return
263
+
264
+ stop = False
265
+ while not stop:
266
+
267
+ # Training
268
+ optimizer.zero_grad()
269
+
270
+ train_loss = train_num = 0
271
+ train_pred_list = []
272
+ train_labels_list = []
273
+ for train_batch in tqdm(train_loader, desc='Epoch {:03d}'.format(epoch), leave=False,
274
+ total=len(train_loader) // train_loader.batch_size):
275
+ net.train()
276
+ batch_data, batch_labels = train_batch
277
+
278
+ train_batch_num = len(batch_labels)
279
+ train_num += train_batch_num
280
+ train_labels_list.append(batch_labels.numpy().flatten())
281
+
282
+ train_batch_loss, train_batch_pred = batch_forward(net, device, criterion, batch_data, batch_labels)
283
+ train_pred_list.append(train_batch_pred.flatten())
284
+
285
+ if torch.isnan(train_batch_loss):
286
+ raise ValueError('NaN loss')
287
+
288
+ train_loss += train_batch_loss.item() * train_batch_num
289
+
290
+ # Optimization
291
+ train_batch_loss.backward()
292
+ optimizer.step()
293
+ optimizer.zero_grad()
294
+
295
+ # Logging
296
+ if iteration > 0 and (iteration % log_interval == 0):
297
+ train_loss /= train_num
298
+ tb.add_scalar('train/loss', train_loss, iteration)
299
+ tb.add_scalar('lr', optimizer.param_groups[0]['lr'], iteration)
300
+ tb.add_scalar('epoch', epoch, iteration)
301
+
302
+ # Checkpoint
303
+ save_model(net, optimizer, train_loss, val_loss, iteration, batch_size, epoch, last_path)
304
+ train_loss = train_num = 0
305
+
306
+ # Validation
307
+ if iteration > 0 and (iteration % validation_interval == 0):
308
+
309
+ # Model checkpoint
310
+ save_model(net, optimizer, train_loss, val_loss, iteration, batch_size, epoch,
311
+ periodic_path.format(iteration))
312
+
313
+ # Train cumulative stats
314
+ train_labels = np.concatenate(train_labels_list)
315
+ train_pred = np.concatenate(train_pred_list)
316
+ train_labels_list = []
317
+ train_pred_list = []
318
+
319
+ train_roc_auc = roc_auc_score(train_labels, train_pred)
320
+ tb.add_scalar('train/roc_auc', train_roc_auc, iteration)
321
+ tb.add_pr_curve('train/pr', train_labels, train_pred, iteration)
322
+
323
+ # Validation
324
+ val_loss = validation_routine(net, device, val_loader, criterion, tb, iteration, 'val')
325
+ tb.flush()
326
+
327
+ # LR Scheduler
328
+ lr_scheduler.step(val_loss)
329
+
330
+ # Model checkpoint
331
+ if val_loss < min_val_loss:
332
+ min_val_loss = val_loss
333
+ save_model(net, optimizer, train_loss, val_loss, iteration, batch_size, epoch, bestval_path)
334
+
335
+ # Attention
336
+ if enable_attention and hasattr(net, 'get_attention'):
337
+ net.eval()
338
+ # For each dataframe show the attention for a real,fake couple of frames
339
+ for df, root, sample_idx, tag in [
340
+ (train_dfs[0], train_roots[0], train_dfs[0][train_dfs[0]['label'] == False].index[0],
341
+ 'train/att/real'),
342
+ (train_dfs[0], train_roots[0], train_dfs[0][train_dfs[0]['label'] == True].index[0],
343
+ 'train/att/fake'),
344
+ ]:
345
+ record = df.loc[sample_idx]
346
+ tb_attention(tb, tag, iteration, net, device, face_size, face_policy,
347
+ transformer, root, record)
348
+
349
+ if optimizer.param_groups[0]['lr'] == min_lr:
350
+ print('Reached minimum learning rate. Stopping.')
351
+ stop = True
352
+ break
353
+
354
+ iteration += 1
355
+
356
+ if iteration > max_num_iterations:
357
+ print('Maximum number of iterations reached')
358
+ stop = True
359
+ break
360
+
361
+ # End of iteration
362
+
363
+ epoch += 1
364
+
365
+ # Needed to flush out last events
366
+ tb.close()
367
+
368
+ print('Completed')
369
+
370
+
371
def tb_attention(tb: SummaryWriter,
                 tag: str,
                 iteration: int,
                 net: nn.Module,
                 device: torch.device,
                 patch_size_load: int,
                 face_crop_scale: str,
                 val_transformer: A.BasicTransform,
                 root: str,
                 record: pd.Series,
                 ):
    """
    Log to TensorBoard a face image multiplied by the attention map of the net.

    The face described by `record` is loaded twice through `load_face`: once with
    `val_transformer` (fed to `net.get_attention`) and once with a bare
    `ToTensorV2()` (used as the picture to display).

    :param tb: writer receiving the image through `add_image`
    :param tag: TensorBoard tag of the logged image
    :param iteration: global step attached to the logged image
    :param net: model exposing a `get_attention` method
    :param device: device on which the network input is placed
    :param patch_size_load: forwarded to `load_face` as `size`
    :param face_crop_scale: forwarded to `load_face` as `scale`
    :param val_transformer: transformer applied to the network input
    :param root: forwarded to `load_face` as `root`
    :param record: forwarded to `load_face` as `record`
    """
    # Network input and "clean" copy of the same face
    sample_t = load_face(record=record, root=root, size=patch_size_load, scale=face_crop_scale,
                         transformer=val_transformer)
    sample_t_clean = load_face(record=record, root=root, size=patch_size_load, scale=face_crop_scale,
                               transformer=ToTensorV2())

    # Follow the requested device instead of the global CUDA availability:
    # `.cuda(device)` rejects a CPU device when CUDA is installed.
    sample_t = sample_t.to(device)

    # Feed to net (first element of the batch of attention maps)
    with torch.no_grad():
        att: torch.Tensor = net.get_attention(sample_t.unsqueeze(0))[0].cpu()

    # Resize the attention map to the face size and use it to modulate the face
    att_img: Image.Image = ToPILImage()(att)
    sample_img = ToPILImage()(sample_t_clean)
    att_img = att_img.resize(sample_img.size, resample=Image.NEAREST).convert('RGB')
    sample_att_img = ImageChops.multiply(sample_img, att_img)
    sample_att = ToTensor()(sample_att_img)
    tb.add_image(tag=tag, img_tensor=sample_att, global_step=iteration)
399
+
400
+
401
def batch_forward(net: nn.Module, device: torch.device, criterion, data: torch.Tensor, labels: torch.Tensor) -> (
        torch.Tensor, np.ndarray):
    """
    Run one forward pass and compute the loss of a batch.

    :param net: model called as `net(data)`
    :param device: device on which `data` and `labels` are moved
    :param criterion: callable invoked as `criterion(out, labels)` on the raw network output
    :param data: batch fed to the network
    :param labels: targets of the batch
    :return: the loss as returned by `criterion` (still attached to the graph, so the
             caller can call `backward()` on it) and the sigmoid of the network output
             as a detached NumPy array
    """
    data = data.to(device)
    labels = labels.to(device)
    out = net(data)
    # Predictions are only used for metrics: detach them and bring them to host memory
    pred = torch.sigmoid(out).detach().cpu().numpy()
    loss = criterion(out, labels)
    return loss, pred
409
+
410
+
411
def validation_routine(net, device, val_loader, criterion, tb, iteration, tag: str, loader_len_norm: int = None):
    """
    Evaluate the net on a whole loader and log the results to TensorBoard.

    The loss of every batch is weighted by the number of labels in the batch, and the
    weighted average is logged under '<tag>/loss'. When `criterion` is a
    `nn.BCEWithLogitsLoss`, ROC AUC and PR curve are logged too.

    :param net: model to evaluate (switched to eval mode)
    :param device: device forwarded to `batch_forward`
    :param val_loader: loader yielding (data, labels) batches
    :param criterion: loss forwarded to `batch_forward`
    :param tb: writer used for scalars and PR curve
    :param iteration: global step attached to the logged values
    :param tag: prefix of the TensorBoard tags
    :param loader_len_norm: divisor applied to `len(val_loader)` for the progress bar total;
                            defaults to the batch size of the loader
    :return: average validation loss
    """
    net.eval()
    if loader_len_norm is None:
        loader_len_norm = val_loader.batch_size

    seen = 0
    loss_sum = 0.
    batch_scores = []
    batch_targets = []
    progress = tqdm(val_loader, desc='Validation', leave=False, total=len(val_loader) // loader_len_norm)
    for batch_data, batch_labels in progress:
        n_batch = len(batch_labels)
        batch_targets.append(batch_labels.flatten())
        with torch.no_grad():
            batch_loss, batch_pred = batch_forward(net, device, criterion, batch_data,
                                                   batch_labels)
        batch_scores.append(batch_pred.flatten())
        seen += n_batch
        loss_sum += batch_loss.item() * n_batch

    # Logging
    val_loss = loss_sum / seen
    tb.add_scalar('{}/loss'.format(tag), val_loss, iteration)

    if isinstance(criterion, nn.BCEWithLogitsLoss):
        targets = np.concatenate(batch_targets)
        scores = np.concatenate(batch_scores)
        tb.add_scalar('{}/roc_auc'.format(tag), roc_auc_score(targets, scores), iteration)
        tb.add_pr_curve('{}/pr'.format(tag), targets, scores, iteration)

    return val_loss
442
+
443
+
444
def save_model(net: nn.Module, optimizer: optim.Optimizer,
               train_loss: float, val_loss: float,
               iteration: int, batch_size: int, epoch: int,
               path: str):
    """
    Save a training checkpoint.

    The checkpoint is a dict with keys 'net', 'opt', 'train_loss', 'val_loss',
    'iteration', 'batch_size' and 'epoch'.

    :param net: model whose `state_dict()` is stored under 'net'
    :param optimizer: optimizer whose `state_dict()` is stored under 'opt'
    :param train_loss: stored as is
    :param val_loss: stored as is
    :param iteration: stored as is
    :param batch_size: stored as is
    :param epoch: stored as is
    :param path: destination file (converted with `str`)
    """
    path = str(path)
    state = dict(net=net.state_dict(),
                 opt=optimizer.state_dict(),
                 train_loss=train_loss,
                 val_loss=val_loss,
                 iteration=iteration,
                 batch_size=batch_size,
                 epoch=epoch)

    # Write next to the destination and rename: an interrupted save must not leave a
    # truncated checkpoint in place of the previous valid one.
    tmp_path = '{}.tmp'.format(path)
    try:
        torch.save(state, tmp_path)
        os.replace(tmp_path, path)
    finally:
        # Only present if the save or the rename failed
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
457
+
458
+
459
+ if __name__ == '__main__':
460
+ main()
models/icpr2020dfdc/train_triplet.py ADDED
@@ -0,0 +1,459 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Video Face Manipulation Detection Through Ensemble of CNNs
3
+
4
+ Image and Sound Processing Lab - Politecnico di Milano
5
+
6
+ Nicolò Bonettini
7
+ Edoardo Daniele Cannas
8
+ Sara Mandelli
9
+ Luca Bondi
10
+ Paolo Bestagini
11
+ """
12
+ import argparse
13
+ import os
14
+ import shutil
15
+ import warnings
16
+
17
+ import numpy as np
18
+ import torch
19
+ import torch.multiprocessing
20
+
21
+ torch.multiprocessing.set_sharing_strategy('file_system')
22
+ import torch.nn as nn
23
+ import torch.optim as optim
24
+ from tensorboardX import SummaryWriter
25
+ from torch.utils.data import DataLoader
26
+ from tqdm import tqdm
27
+
28
+ from architectures import tripletnet
29
+ from train_binclass import save_model, tb_attention
30
+ from isplutils.data import FrameFaceIterableDataset
31
+ from isplutils.data_siamese import FrameFaceTripletIterableDataset
32
+ from isplutils import split, utils
33
+
34
+
35
def main():
    """
    Command line entry point for training a triplet network.

    Parses the arguments, builds net / loss / optimizer / LR scheduler, optionally
    restores a checkpoint, then iterates over the training triplets. Every
    `--logint` iterations the training loss is logged and 'last.pth' is saved;
    every `--valint` iterations the validation loss is computed, the scheduler is
    stepped and the periodic / best checkpoints are written. Training stops when
    the learning rate reaches its minimum or after `--maxiter` iterations.
    """
    # Args
    parser = argparse.ArgumentParser()
    parser.add_argument('--net', type=str, help='Net model class', required=True)
    parser.add_argument('--traindb', type=str, help='Training datasets', nargs='+', choices=split.available_datasets,
                        required=True)
    parser.add_argument('--valdb', type=str, help='Validation datasets', nargs='+', choices=split.available_datasets,
                        required=True)
    parser.add_argument('--dfdc_faces_df_path', type=str, action='store',
                        help='Path to the Pandas Dataframe obtained from extract_faces.py on the DFDC dataset. '
                             'Required for training/validating on the DFDC dataset.')
    parser.add_argument('--dfdc_faces_dir', type=str, action='store',
                        help='Path to the directory containing the faces extracted from the DFDC dataset. '
                             'Required for training/validating on the DFDC dataset.')
    parser.add_argument('--ffpp_faces_df_path', type=str, action='store',
                        help='Path to the Pandas Dataframe obtained from extract_faces.py on the FF++ dataset. '
                             'Required for training/validating on the FF++ dataset.')
    parser.add_argument('--ffpp_faces_dir', type=str, action='store',
                        help='Path to the directory containing the faces extracted from the FF++ dataset. '
                             'Required for training/validating on the FF++ dataset.')
    parser.add_argument('--face', type=str, help='Face crop or scale', required=True,
                        choices=['scale', 'tight'])
    parser.add_argument('--size', type=int, help='Train patch size', required=True)

    parser.add_argument('--batch', type=int, help='Batch size to fit in GPU memory', default=12)
    parser.add_argument('--lr', type=float, default=1e-5, help='Learning rate')
    parser.add_argument('--valint', type=int, help='Validation interval (iterations)', default=500)
    parser.add_argument('--patience', type=int, help='Patience before dropping the LR [validation intervals]',
                        default=10)
    parser.add_argument('--maxiter', type=int, help='Maximum number of iterations', default=20000)
    parser.add_argument('--init', type=str, help='Weight initialization file')
    parser.add_argument('--scratch', action='store_true', help='Train from scratch')

    parser.add_argument('--traintriplets', type=int, help='Limit the number of train triplets per epoch', default=-1)
    parser.add_argument('--valtriplets', type=int, help='Limit the number of validation triplets per epoch',
                        default=2000)

    parser.add_argument('--logint', type=int, help='Training log interval (iterations)', default=100)
    parser.add_argument('--workers', type=int, help='Num workers for data loaders', default=6)
    parser.add_argument('--device', type=int, help='GPU device id', default=0)
    parser.add_argument('--seed', type=int, help='Random seed', default=0)

    parser.add_argument('--debug', action='store_true', help='Activate debug')
    parser.add_argument('--suffix', type=str, help='Suffix to default tag')

    parser.add_argument('--attention', action='store_true',
                        help='Enable Tensorboard log of attention masks')
    parser.add_argument('--embedding', action='store_true', help='Activate embedding visualization in TensorBoard')
    parser.add_argument('--embeddingint', type=int, help='Embedding visualization interval in TensorBoard',
                        default=5000)

    parser.add_argument('--log_dir', type=str, help='Directory for saving the training logs',
                        default='runs/triplet/')
    parser.add_argument('--models_dir', type=str, help='Directory for saving the models weights',
                        default='weights/triplet/')

    args = parser.parse_args()

    # Parse arguments
    net_class = getattr(tripletnet, args.net)
    train_datasets = args.traindb
    val_datasets = args.valdb
    dfdc_df_path = args.dfdc_faces_df_path
    ffpp_df_path = args.ffpp_faces_df_path
    dfdc_faces_dir = args.dfdc_faces_dir
    ffpp_faces_dir = args.ffpp_faces_dir
    face_policy = args.face
    face_size = args.size

    batch_size = args.batch
    initial_lr = args.lr
    validation_interval = args.valint
    patience = args.patience
    max_num_iterations = args.maxiter
    initial_model = args.init
    train_from_scratch = args.scratch

    max_train_triplets = args.traintriplets
    max_val_triplets = args.valtriplets

    log_interval = args.logint
    num_workers = args.workers
    device = torch.device('cuda:{:d}'.format(args.device)) if torch.cuda.is_available() else torch.device('cpu')
    seed = args.seed

    debug = args.debug
    suffix = args.suffix

    enable_attention = args.attention
    enable_embedding = args.embedding
    embedding_interval = args.embeddingint

    weights_folder = args.models_dir
    logs_folder = args.log_dir

    # Random initialization
    np.random.seed(seed)
    torch.random.manual_seed(seed)

    # Load net
    net: nn.Module = net_class().to(device)

    # Loss and optimizers
    criterion = nn.TripletMarginLoss()

    min_lr = initial_lr * 1e-5
    optimizer = optim.Adam(net.get_trainable_parameters(), lr=initial_lr)
    lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer=optimizer,
        mode='min',
        factor=0.1,
        patience=patience,
        cooldown=2 * patience,
        min_lr=min_lr,
    )

    tag = utils.make_train_tag(net_class=net_class,
                               traindb=train_datasets,
                               face_policy=face_policy,
                               patch_size=face_size,
                               seed=seed,
                               suffix=suffix,
                               debug=debug,
                               )

    # Model checkpoint paths
    bestval_path = os.path.join(weights_folder, tag, 'bestval.pth')
    last_path = os.path.join(weights_folder, tag, 'last.pth')
    periodic_path = os.path.join(weights_folder, tag, 'it{:06d}.pth')

    os.makedirs(os.path.join(weights_folder, tag), exist_ok=True)

    # Load model
    val_loss = min_val_loss = 20
    epoch = iteration = 0
    net_state = None
    opt_state = None
    if initial_model is not None:
        # If given load initial model
        print('Loading model form: {}'.format(initial_model))
        state = torch.load(initial_model, map_location='cpu')
        net_state = state['net']
    elif not train_from_scratch and os.path.exists(last_path):
        # Otherwise resume from the last checkpoint of this very tag
        print('Loading model form: {}'.format(last_path))
        state = torch.load(last_path, map_location='cpu')
        net_state = state['net']
        opt_state = state['opt']
        iteration = state['iteration'] + 1
        epoch = state['epoch']
    if not train_from_scratch and os.path.exists(bestval_path):
        # Only the best validation loss is recovered from the best checkpoint
        state = torch.load(bestval_path, map_location='cpu')
        min_val_loss = state['val_loss']
    if net_state is not None:
        # Weights coming from a binary classifier lack the 'feat_ext.' prefix
        adapt_binclass_model(net_state)
        incomp_keys = net.load_state_dict(net_state, strict=False)
        print(incomp_keys)
    if opt_state is not None:
        # The learning rate given on the command line overrides the stored one
        for param_group in opt_state['param_groups']:
            param_group['lr'] = initial_lr
        optimizer.load_state_dict(opt_state)

    # Initialize Tensorboard
    logdir = os.path.join(logs_folder, tag)
    if iteration == 0:
        # If training from scratch or initialization remove history if exists
        shutil.rmtree(logdir, ignore_errors=True)

    # TensorboardX instance
    tb = SummaryWriter(logdir=logdir)
    if iteration == 0:
        dummy = torch.randn((1, 3, face_size, face_size), device=device)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            tb.add_graph(net, [dummy, dummy, dummy], verbose=False)

    transformer = utils.get_transformer(face_policy=face_policy, patch_size=face_size,
                                        net_normalizer=net.get_normalizer(), train=True)

    # Datasets and data loaders
    print('Loading data')
    # Check if paths for DFDC and FF++ extracted faces and DataFrames are provided
    for dataset in train_datasets:
        if dataset.split('-')[0] == 'dfdc' and (dfdc_df_path is None or dfdc_faces_dir is None):
            raise RuntimeError('Specify DataFrame and directory for DFDC faces for training!')
        elif dataset.split('-')[0] == 'ff' and (ffpp_df_path is None or ffpp_faces_dir is None):
            raise RuntimeError('Specify DataFrame and directory for FF++ faces for training!')
    for dataset in val_datasets:
        if dataset.split('-')[0] == 'dfdc' and (dfdc_df_path is None or dfdc_faces_dir is None):
            raise RuntimeError('Specify DataFrame and directory for DFDC faces for validation!')
        elif dataset.split('-')[0] == 'ff' and (ffpp_df_path is None or ffpp_faces_dir is None):
            raise RuntimeError('Specify DataFrame and directory for FF++ faces for validation!')
    splits = split.make_splits(dfdc_df=dfdc_df_path, ffpp_df=ffpp_df_path, dfdc_dir=dfdc_faces_dir,
                               ffpp_dir=ffpp_faces_dir, dbs={'train': train_datasets, 'val': val_datasets})
    train_dfs = [splits['train'][db][0] for db in splits['train']]
    train_roots = [splits['train'][db][1] for db in splits['train']]
    val_roots = [splits['val'][db][1] for db in splits['val']]
    val_dfs = [splits['val'][db][0] for db in splits['val']]

    train_dataset = FrameFaceTripletIterableDataset(roots=train_roots,
                                                    dfs=train_dfs,
                                                    scale=face_policy,
                                                    num_triplets=max_train_triplets,
                                                    transformer=transformer,
                                                    size=face_size,
                                                    )

    val_dataset = FrameFaceTripletIterableDataset(roots=val_roots,
                                                  dfs=val_dfs,
                                                  scale=face_policy,
                                                  num_triplets=max_val_triplets,
                                                  transformer=transformer,
                                                  size=face_size,
                                                  )

    train_loader = DataLoader(train_dataset, num_workers=num_workers, batch_size=batch_size, )

    val_loader = DataLoader(val_dataset, num_workers=num_workers, batch_size=batch_size, )

    print('Training triplets: {}'.format(len(train_dataset)))
    print('Validation triplets: {}'.format(len(val_dataset)))

    if len(train_dataset) == 0:
        print('No training triplets. Halt.')
        return

    if len(val_dataset) == 0:
        print('No validation triplets. Halt.')
        return

    # Embedding visualization
    if enable_embedding:
        train_dataset_embedding = FrameFaceIterableDataset(roots=train_roots,
                                                           dfs=train_dfs,
                                                           scale=face_policy,
                                                           num_samples=64,
                                                           transformer=transformer,
                                                           size=face_size,
                                                           )
        train_loader_embedding = DataLoader(train_dataset_embedding, num_workers=num_workers, batch_size=batch_size, )
        val_dataset_embedding = FrameFaceIterableDataset(roots=val_roots,
                                                         dfs=val_dfs,
                                                         scale=face_policy,
                                                         num_samples=64,
                                                         transformer=transformer,
                                                         size=face_size,
                                                         )
        val_loader_embedding = DataLoader(val_dataset_embedding, num_workers=num_workers, batch_size=batch_size, )

    else:
        train_loader_embedding = None
        val_loader_embedding = None

    stop = False
    while not stop:

        # Training
        optimizer.zero_grad()

        train_loss = train_num = 0
        for train_batch in tqdm(train_loader, desc='Epoch {:03d}'.format(epoch), leave=False,
                                total=len(train_loader) // train_loader.batch_size):
            net.train()
            train_batch_num = len(train_batch[0])
            train_num += train_batch_num

            train_batch_loss = batch_forward(net, device, criterion, train_batch)

            if torch.isnan(train_batch_loss):
                raise ValueError('NaN loss')

            train_loss += train_batch_loss.item() * train_batch_num

            # Optimization
            train_batch_loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            # Logging
            if iteration > 0 and (iteration % log_interval == 0):
                train_loss /= train_num
                tb.add_scalar('train/loss', train_loss, iteration)
                tb.add_scalar('lr', optimizer.param_groups[0]['lr'], iteration)
                tb.add_scalar('epoch', epoch, iteration)

                # Checkpoint
                save_model(net, optimizer, train_loss, val_loss, iteration, batch_size, epoch, last_path)
                train_loss = train_num = 0

            # Validation
            if iteration > 0 and (iteration % validation_interval == 0):

                # Validation
                val_loss = validation_routine(net, device, val_loader, criterion, tb, iteration, tag='val')
                tb.flush()

                # LR Scheduler
                lr_scheduler.step(val_loss)

                # Model checkpoint
                save_model(net, optimizer, train_loss, val_loss, iteration, batch_size, epoch,
                           periodic_path.format(iteration))
                if val_loss < min_val_loss:
                    min_val_loss = val_loss
                    shutil.copy(periodic_path.format(iteration), bestval_path)

                # Attention
                if enable_attention and hasattr(net, 'feat_ext') and hasattr(net.feat_ext, 'get_attention'):
                    net.eval()
                    # For each dataframe show the attention for a real,fake couple of frames

                    # Dedicated loop name: `tag` is the run tag and is still needed
                    # below to build the embedding tags.
                    for df, root, sample_idx, att_tag in [
                        (train_dfs[0], train_roots[0], train_dfs[0][train_dfs[0]['label'] == False].index[0],
                         'train/att/real'),
                        (train_dfs[0], train_roots[0], train_dfs[0][train_dfs[0]['label'] == True].index[0],
                         'train/att/fake'),
                    ]:
                        record = df.loc[sample_idx]
                        tb_attention(tb, att_tag, iteration, net.feat_ext, device, face_size, face_policy,
                                     transformer, root, record)

                if optimizer.param_groups[0]['lr'] <= min_lr:
                    print('Reached minimum learning rate. Stopping.')
                    stop = True
                    break

            # Embedding visualization
            if enable_embedding:
                if iteration > 0 and (iteration % embedding_interval == 0):
                    embedding_routine(net=net,
                                      device=device,
                                      loader=train_loader_embedding,
                                      iteration=iteration,
                                      tb=tb,
                                      tag=tag + '/train')
                    embedding_routine(net=net,
                                      device=device,
                                      loader=val_loader_embedding,
                                      iteration=iteration,
                                      tb=tb,
                                      tag=tag + '/val')

            iteration += 1

            if iteration > max_num_iterations:
                print('Maximum number of iterations reached')
                stop = True
                break

            # End of iteration

        epoch += 1

    # Needed to flush out last events
    tb.close()

    print('Completed')
391
+
392
+
393
def adapt_binclass_model(net_state):
    """
    Prefix in place every key of a state dict with 'feat_ext.' when none has it.

    If at least one key already starts with 'feat_ext.' the dict is left untouched;
    otherwise 'Adapting keys' is printed and all the keys are renamed.

    :param net_state: mapping from parameter names to values, modified in place
    """
    if any(name.startswith('feat_ext.') for name in net_state):
        return

    # Adapt all keys
    print('Adapting keys')
    # Iterate over a snapshot of the names: the dict is mutated while renaming
    for name in list(net_state):
        net_state['feat_ext.{}'.format(name)] = net_state.pop(name)
407
+
408
+
409
def batch_forward(net: nn.Module, device, criterion, data: tuple) -> torch.Tensor:
    """
    Run one forward pass on a batch of triplets and compute its loss.

    :param net: model called as `net(*data)`; its outputs are unpacked into `criterion`
    :param device: device on which every element of `data` is moved
    :param criterion: loss called as `criterion(*out)`
    :param data: tensors of the batch, one per network input
    :return: the loss returned by `criterion`
    """
    # Follow the requested device instead of the global CUDA availability:
    # `.cuda(device)` rejects a CPU device when CUDA is installed.
    data = [item.to(device) for item in data]
    out = net(*data)
    loss = criterion(*out)
    return loss
415
+
416
+
417
def validation_routine(net, device, val_loader, criterion, tb, iteration, tag):
    """
    Compute the average triplet loss over a whole loader and log it to TensorBoard.

    The loss of every batch is weighted by the length of the first element of the
    batch; the weighted average is logged under '<tag>/loss'.

    :param net: model to evaluate (switched to eval mode)
    :param device: device forwarded to `batch_forward`
    :param val_loader: loader yielding the validation batches
    :param criterion: loss forwarded to `batch_forward`
    :param tb: writer receiving the scalar
    :param iteration: global step attached to the logged value
    :param tag: prefix of the TensorBoard tag
    :return: average validation loss
    """
    net.eval()

    seen = 0
    loss_sum = 0.
    batches = tqdm(val_loader, desc='Validation', leave=False, total=len(val_loader) // val_loader.batch_size)
    for batch in batches:
        n_batch = len(batch[0])
        with torch.no_grad():
            batch_loss = batch_forward(net, device, criterion, batch, )
        seen += n_batch
        loss_sum += batch_loss.item() * n_batch

    # Logging
    mean_loss = loss_sum / seen
    tb.add_scalar('{}/loss'.format(tag), mean_loss, iteration)

    return mean_loss
434
+
435
+
436
def embedding_routine(net: nn.Module, device: torch.device, loader: DataLoader, tb: SummaryWriter, iteration: int,
                      tag: str):
    """
    Log to TensorBoard the embeddings of all the faces yielded by a loader.

    Every batch goes through `net.features`; the outputs are flattened to one row
    per sample and logged with `add_embedding`, using the labels as metadata.

    :param net: model exposing a `features` method (switched to eval mode)
    :param device: device on which the faces are moved when CUDA is available
    :param loader: iterable yielding (faces, labels) batches
    :param tb: writer receiving the embedding
    :param iteration: global step attached to the embedding
    :param tag: TensorBoard tag of the embedding
    """
    net.eval()

    label_chunks = []
    feature_chunks = []
    for faces, face_labels in loader:
        if torch.cuda.is_available():
            faces = faces.to(device)
        with torch.no_grad():
            feats = net.features(faces)
        label_chunks.append(face_labels.numpy().flatten())
        # One row per sample, whatever the shape of the features
        feature_chunks.append(feats.cpu().flatten(start_dim=1).numpy())

    all_labels = list(np.concatenate(label_chunks))
    all_features = np.concatenate(feature_chunks)

    # Logging
    tb.add_embedding(mat=all_features, metadata=all_labels, tag=tag, global_step=iteration)
456
+
457
+
458
+ if __name__ == '__main__':
459
+ main()
models/model_loader.py CHANGED
@@ -27,6 +27,7 @@ class ModelLoader:
27
  cls._instance._face_detector = None
28
  cls._instance._spacy_nlp = None
29
  cls._instance._sentence_transformer = None
 
30
  return cls._instance
31
 
32
  @classmethod
@@ -146,6 +147,23 @@ class ModelLoader:
146
  logger.info("MediaPipe FaceMesh loaded")
147
  return self._face_detector
148
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
  # ---------- Preload ----------
150
  def preload_phase1(self) -> None:
151
  """Preload only what Phase 1 needs (image model)."""
 
27
  cls._instance._face_detector = None
28
  cls._instance._spacy_nlp = None
29
  cls._instance._sentence_transformer = None
30
+ cls._instance._efficientnet_detector = None
31
  return cls._instance
32
 
33
  @classmethod
 
147
  logger.info("MediaPipe FaceMesh loaded")
148
  return self._face_detector
149
 
150
+ # ---------- EfficientNetAutoAttB4 (ICPR2020 / DeepShield1 merge) ----------
151
def load_efficientnet(self):
    """Return the cached EfficientNetAutoAttB4 detector, building it on first use.

    Returns None (after logging a warning) when the detector cannot be
    constructed. A failure is not cached, so the next call tries again.
    """
    if self._efficientnet_detector is not None:
        return self._efficientnet_detector

    try:
        # Imported lazily so a missing optional dependency only disables this model.
        from services.efficientnet_service import EfficientNetDetector

        detector = EfficientNetDetector(
            model_name=settings.EFFICIENTNET_MODEL,
            train_db=settings.EFFICIENTNET_TRAIN_DB,
            device=settings.DEVICE,
        )
    except Exception as e:
        logger.warning(f"EfficientNet load failed (continuing without it): {e}")
        return None

    self._efficientnet_detector = detector
    return detector
166
+
167
  # ---------- Preload ----------
168
  def preload_phase1(self) -> None:
169
  """Preload only what Phase 1 needs (image model)."""
requirements.txt CHANGED
@@ -11,6 +11,13 @@ alembic==1.13.3
11
  python-jose[cryptography]==3.3.0
12
  bcrypt==4.2.0
13
 
 
 
 
 
 
 
 
14
  # === Phase 1: Image Detection ===
15
  # Install torch separately with CPU index first (see README): pip install torch==2.4.1 --index-url https://download.pytorch.org/whl/cpu
16
  torch==2.4.1
 
11
  python-jose[cryptography]==3.3.0
12
  bcrypt==4.2.0
13
 
14
+ # === MERGE: EfficientNetAutoAttB4 (DeepShield1 / ICPR2020) ===
15
+ albumentations>=1.3.0,<1.5 # Required by icpr2020dfdc isplutils transforms; pin to avoid 1.5+ API break
16
+ scipy>=1.13.0 # expit (sigmoid) for EfficientNet logit conversion
17
+ # NOTE: MERGE_PLAN §4 said NOT to install efficientnet-pytorch, but fornet.py imports it directly.
18
+ efficientnet-pytorch==0.7.1 # Required by icpr2020dfdc/architectures/fornet.py
19
+ psutil>=5.9.0 # RAM monitoring in smoke tests
20
+
21
  # === Phase 1: Image Detection ===
22
  # Install torch separately with CPU index first (see README): pip install torch==2.4.1 --index-url https://download.pytorch.org/whl/cpu
23
  torch==2.4.1
schemas/common.py CHANGED
@@ -86,3 +86,4 @@ class ProcessingSummary(BaseModel):
86
  stages_completed: List[str]
87
  total_duration_ms: int
88
  model_used: str
 
 
86
  stages_completed: List[str]
87
  total_duration_ms: int
88
  model_used: str
89
+ models_used: List[str] = [] # all models that contributed (ensemble)
services/efficientnet_service.py ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """EfficientNetAutoAttB4 adapter — wraps ICPR2020 DFDC model into DeepShield service interface."""
2
+ from __future__ import annotations
3
+
4
+ import pickle
5
+ import sys
6
+ from pathlib import Path
7
+ from typing import List, Optional
8
+
9
+ import numpy as np
10
+ import torch
11
+ from loguru import logger
12
+ from PIL import Image
13
+ from scipy.special import expit
14
+ from torch.utils.model_zoo import load_url
15
+
16
+ # Resolve ICPR2020 repo root and patch sys.path so its modules are importable.
17
+ _ICPR_ROOT = (Path(__file__).resolve().parent.parent / "models" / "icpr2020dfdc").resolve()
18
+ _NOTEBOOK_DIR = str(_ICPR_ROOT / "notebook")
19
+ if str(_ICPR_ROOT) not in sys.path:
20
+ sys.path.insert(0, str(_ICPR_ROOT))
21
+ if _NOTEBOOK_DIR not in sys.path:
22
+ sys.path.insert(0, _NOTEBOOK_DIR)
23
+
24
+ # These imports are valid only after the sys.path patch above.
25
+ from blazeface import BlazeFace, FaceExtractor # noqa: E402
26
+ from architectures import fornet, weights # noqa: E402
27
+ from isplutils import utils as ispl_utils # noqa: E402
28
+
29
+ # Default calibrator path — populated by scripts/fit_calibrator.py.
30
+ _CALIBRATOR_PATH = Path(__file__).resolve().parent.parent / "models" / "efficientnet_calibrator.pkl"
31
+
32
+
33
def _load_calibrator(path: Path = _CALIBRATOR_PATH):
    """Unpickle the score calibrator stored at ``path``.

    Returns the unpickled object, or None when the file does not exist or
    cannot be loaded (a warning is logged in the latter case).
    """
    if path.exists():
        try:
            # NOTE(review): pickle executes arbitrary code on load; this file must
            # only ever come from a trusted source.
            with path.open("rb") as fh:
                calibrator = pickle.load(fh)
            logger.info(f"Isotonic calibrator loaded from {path}")
        except Exception as e:
            logger.warning(f"Failed to load calibrator ({e}) — using raw sigmoid scores")
        else:
            return calibrator
    return None
45
+
46
+
47
class EfficientNetDetector:
    """Thin adapter that loads EfficientNetAutoAttB4 (DFDC-trained) and exposes
    detect_image() / detect_video_frames() matching DeepShield's service interface.

    If a calibrator pickle is found (by default
    backend/models/efficientnet_calibrator.pkl, produced by
    scripts/fit_calibrator.py), raw sigmoid scores are passed through it before
    being returned. Set ``self.calibrator = None`` to disable calibration.
    """

    # Scores at or above this value are reported as FAKE. The comparison is
    # inclusive to match the ">= 0.5" rule used by the ensemble and video code.
    _FAKE_THRESHOLD = 0.5

    def __init__(
        self,
        model_name: str = "EfficientNetAutoAttB4",
        train_db: str = "DFDC",
        device: str = "cpu",
        calibrator_path: Optional[Path] = None,
    ) -> None:
        """Load the classifier weights, the BlazeFace face detector and the calibrator.

        Args:
            model_name: Name of the architecture class looked up on ``fornet``.
            train_db: Training-set suffix; ``"<model_name>_<train_db>"`` selects the weights URL.
            device: Torch device string the networks are moved to.
            calibrator_path: Optional override for the calibrator pickle location.

        Raises:
            KeyError: If no weights URL is registered for the model/DB pair.
            FileNotFoundError: If the BlazeFace weights or anchors file is missing.
        """
        self.device = torch.device(device)
        self.model_name = model_name
        self.train_db = train_db

        weight_key = f"{model_name}_{train_db}"
        if weight_key not in weights.weight_url:
            raise KeyError(f"Unknown model/DB combination: {weight_key}")

        self.net = getattr(fornet, model_name)().eval().to(self.device)
        # check_hash=False — the ISPL mirror occasionally has stale sha256 hashes in URLs.
        state = load_url(weights.weight_url[weight_key], map_location=self.device, check_hash=False)
        self.net.load_state_dict(state)

        self.transf = ispl_utils.get_transformer(
            "scale", 224, self.net.get_normalizer(), train=False
        )

        blazeface_dir = _ICPR_ROOT / "blazeface"
        weights_path = blazeface_dir / "blazeface.pth"
        anchors_path = blazeface_dir / "anchors.npy"
        if not weights_path.exists() or not anchors_path.exists():
            raise FileNotFoundError(
                f"BlazeFace assets missing: expected {weights_path} and {anchors_path}. "
                "Ensure icpr2020dfdc is cloned into backend/models/ with its blazeface/ subdirectory."
            )

        self.facedet = BlazeFace().to(self.device)
        self.facedet.load_weights(str(weights_path))
        self.facedet.load_anchors(str(anchors_path))
        self.face_extractor = FaceExtractor(facedet=self.facedet)

        self.calibrator = _load_calibrator(calibrator_path or _CALIBRATOR_PATH)
        self.calibrator_applied = self.calibrator is not None

        logger.info(
            f"EfficientNetDetector ready: {model_name}/{train_db} on {self.device} "
            f"| calibrator={'yes' if self.calibrator_applied else 'no'}"
        )

    @property
    def _model_id(self) -> str:
        """Identifier reported in the ``"model"`` field of every result dict."""
        return f"{self.model_name}_{self.train_db}"

    def _face_tensor(self, face_np: np.ndarray) -> torch.Tensor:
        """Apply the evaluation transform to a cropped face and return its ``"image"`` output."""
        result = self.transf(image=face_np)
        return result["image"]

    def _calibrate(self, score: float) -> float:
        """Apply the calibrator to one score; return the score unchanged if none is loaded.

        Calibration is best-effort: if the calibrator raises, the raw score is
        returned and a warning is logged so the degradation is visible.
        """
        if self.calibrator is None:
            return score
        try:
            return float(self.calibrator.predict([[score]])[0])
        except Exception as e:  # noqa: BLE001 - deliberate best-effort fallback
            logger.warning(f"Calibrator failed on score ({e}) — returning raw score")
            return score

    def _calibrate_batch(self, scores: np.ndarray) -> np.ndarray:
        """Apply the calibrator to a 1-D array of scores (same fallback rules as ``_calibrate``)."""
        if self.calibrator is None:
            return scores
        try:
            return self.calibrator.predict(scores.reshape(-1, 1)).flatten()
        except Exception as e:  # noqa: BLE001 - deliberate best-effort fallback
            logger.warning(f"Calibrator failed on batch ({e}) — returning raw scores")
            return scores

    def raw_logit(self, face_tensor: torch.Tensor) -> float:
        """Return raw logit for a single face tensor — used by fit_calibrator.py."""
        with torch.inference_mode():
            return float(self.net(face_tensor.unsqueeze(0).to(self.device)).item())

    def detect_image(self, pil_image: Image.Image) -> dict:
        """Run EfficientNet on the first face found in a single PIL image.

        The image is converted to RGB if needed. When no face is found the
        result carries ``error="no_face"`` and ``score``/``result`` are None.

        Returns:
            {"score": float|None, "result": "FAKE"|"REAL"|None, "model": str,
             "error": str|None, "calibrator_applied": bool}
        """
        if pil_image.mode != "RGB":
            pil_image = pil_image.convert("RGB")
        img_array = np.array(pil_image)

        frame_data = self.face_extractor.process_image(img=img_array)
        faces: list = frame_data.get("faces", [])
        if not faces:
            logger.debug("EfficientNetDetector.detect_image: no face detected")
            return {
                "error": "no_face",
                "score": None,
                "result": None,
                "model": self._model_id,
                "calibrator_applied": False,
            }

        face_t = self._face_tensor(faces[0])
        with torch.inference_mode():
            logit = self.net(face_t.unsqueeze(0).to(self.device))
            raw_score = float(torch.sigmoid(logit).item())

        score = self._calibrate(raw_score)
        return {
            "score": score,
            "result": "FAKE" if score >= self._FAKE_THRESHOLD else "REAL",
            "model": self._model_id,
            "error": None,
            "calibrator_applied": self.calibrator_applied,
        }

    def detect_video_frames(self, frames: List[np.ndarray]) -> dict:
        """Run EfficientNet on the first face of each frame and aggregate the scores.

        Three-channel uint8 frames have their channel order reversed before
        face extraction, i.e. they are treated as BGR input (as produced by
        OpenCV); passing RGB uint8 frames would therefore swap their channels.
        Other frames are passed through unchanged. Frames without a detected
        face are skipped, so ``per_frame`` may be shorter than ``frames``.

        ``mean_score`` is the calibrated sigmoid of the mean logit, not the
        mean of the per-frame scores.

        Returns:
            {"mean_score": float|None, "per_frame": list[float], "model": str,
             "error": str|None, "calibrator_applied": bool}
        """
        face_tensors: list[torch.Tensor] = []
        for frame in frames:
            # Reverse the channel axis of 3-channel uint8 frames (BGR -> RGB).
            if frame.ndim == 3 and frame.shape[2] == 3:
                frame_rgb = frame[..., ::-1].copy() if frame.dtype == np.uint8 else frame
            else:
                frame_rgb = frame
            frame_data = self.face_extractor.process_image(img=frame_rgb)
            faces: list = frame_data.get("faces", [])
            if faces:
                face_tensors.append(self._face_tensor(faces[0]))

        if not face_tensors:
            logger.debug("EfficientNetDetector.detect_video_frames: no faces in any frame")
            return {
                "error": "no_faces",
                "mean_score": None,
                "per_frame": [],
                "model": self._model_id,
                "calibrator_applied": False,
            }

        batch = torch.stack(face_tensors).to(self.device)
        with torch.inference_mode():
            logits = self.net(batch).cpu().numpy().flatten()

        raw_per_frame = expit(logits)
        per_frame = self._calibrate_batch(raw_per_frame).tolist()
        mean_score = float(self._calibrate(float(expit(np.mean(logits)))))
        return {
            "mean_score": mean_score,
            "per_frame": per_frame,
            "model": self._model_id,
            "error": None,
            "calibrator_applied": self.calibrator_applied,
        }
services/image_service.py CHANGED
@@ -1,8 +1,8 @@
1
  from __future__ import annotations
2
 
3
  import io
4
- from dataclasses import dataclass
5
- from typing import Tuple
6
 
7
  import torch
8
  from loguru import logger
@@ -17,6 +17,8 @@ class ImageClassification:
17
  label: str
18
  confidence: float
19
  all_scores: dict[str, float]
 
 
20
 
21
 
22
  def load_image_from_bytes(data: bytes) -> Image.Image:
@@ -26,8 +28,8 @@ def load_image_from_bytes(data: bytes) -> Image.Image:
26
  return img
27
 
28
 
29
- def classify_image(pil_img: Image.Image) -> ImageClassification:
30
- """Run the ViT deepfake classifier on a PIL image."""
31
  loader = get_model_loader()
32
  model, processor = loader.load_image_model()
33
 
@@ -36,17 +38,88 @@ def classify_image(pil_img: Image.Image) -> ImageClassification:
36
 
37
  with torch.no_grad():
38
  outputs = model(**inputs)
39
- logits = outputs.logits # (1, num_labels)
40
  probs = torch.softmax(logits, dim=-1)[0]
41
 
42
  id2label: dict[int, str] = getattr(model.config, "id2label", {})
43
  all_scores = {id2label.get(i, str(i)): float(p.item()) for i, p in enumerate(probs)}
44
  top_idx = int(torch.argmax(probs).item())
45
  top_label = id2label.get(top_idx, str(top_idx))
46
- top_conf = float(probs[top_idx].item())
47
 
48
- logger.info(f"Image classify {top_label} @ {top_conf:.3f}")
49
- return ImageClassification(label=top_label, confidence=top_conf, all_scores=all_scores)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
 
51
 
52
  def preprocess_and_classify(raw_bytes: bytes) -> Tuple[Image.Image, ImageClassification]:
 
1
  from __future__ import annotations
2
 
3
  import io
4
+ from dataclasses import dataclass, field
5
+ from typing import List, Optional, Tuple
6
 
7
  import torch
8
  from loguru import logger
 
17
  label: str
18
  confidence: float
19
  all_scores: dict[str, float]
20
+ models_used: List[str] = field(default_factory=list)
21
+ ensemble_method: Optional[str] = None
22
 
23
 
24
  def load_image_from_bytes(data: bytes) -> Image.Image:
 
28
  return img
29
 
30
 
31
+ def _classify_vit(pil_img: Image.Image) -> Tuple[float, str, dict[str, float]]:
32
+ """Run the ViT deepfake classifier. Returns (fake_prob, top_label, all_scores)."""
33
  loader = get_model_loader()
34
  model, processor = loader.load_image_model()
35
 
 
38
 
39
  with torch.no_grad():
40
  outputs = model(**inputs)
41
+ logits = outputs.logits
42
  probs = torch.softmax(logits, dim=-1)[0]
43
 
44
  id2label: dict[int, str] = getattr(model.config, "id2label", {})
45
  all_scores = {id2label.get(i, str(i)): float(p.item()) for i, p in enumerate(probs)}
46
  top_idx = int(torch.argmax(probs).item())
47
  top_label = id2label.get(top_idx, str(top_idx))
 
48
 
49
+ # Identify the fake probability — pick the highest score from fake-labelled classes.
50
+ fake_tokens = ("fake", "deepfake", "manipulated", "ai", "generated", "synthetic")
51
+ fake_prob = max(
52
+ (float(p) for lbl, p in all_scores.items() if any(t in lbl.lower() for t in fake_tokens)),
53
+ default=float(probs[top_idx].item()),
54
+ )
55
+ return fake_prob, top_label, all_scores
56
+
57
+
58
def _vit_only_classification(
    vit_fake_prob: float,
    vit_scores: dict[str, float],
    models_used: List[str],
    ensemble_method: Optional[str],
) -> ImageClassification:
    """Build the result returned whenever only the ViT score is available."""
    return ImageClassification(
        label="Fake" if vit_fake_prob >= 0.5 else "Real",
        confidence=vit_fake_prob,
        all_scores=vit_scores,
        models_used=models_used,
        ensemble_method=ensemble_method,
    )


def classify_image(pil_img: Image.Image) -> ImageClassification:
    """Run deepfake classification on a PIL image.

    The ViT classifier always runs. When ``settings.ENSEMBLE_MODE`` is true and
    the EfficientNet detector loads and returns a score, the two fake
    probabilities are averaged. Otherwise the ViT probability is used alone:

    * ensemble disabled or detector unavailable -> ``ensemble_method=None``
    * detector returned an error or no score -> ``ensemble_method="vit_only_no_face"``

    In every case ``confidence`` holds the fake probability and the label is
    "Fake" when that probability is at least 0.5.
    """
    vit_fake_prob, vit_label, vit_scores = _classify_vit(pil_img)
    models_used = [settings.IMAGE_MODEL_ID]

    if not settings.ENSEMBLE_MODE:
        logger.info(f"Image classify (ViT-only) → {vit_label} @ fake_p={vit_fake_prob:.3f}")
        return _vit_only_classification(vit_fake_prob, vit_scores, models_used, None)

    # Attempt EfficientNet inference.
    eff_detector = get_model_loader().load_efficientnet()
    if eff_detector is None:
        logger.warning("EfficientNet unavailable — falling back to ViT-only")
        return _vit_only_classification(vit_fake_prob, vit_scores, models_used, None)

    eff_result = eff_detector.detect_image(pil_img)
    if eff_result.get("error") or eff_result.get("score") is None:
        # The detector produced no usable score — trust ViT alone.
        logger.info(f"EfficientNet no-face fallback → using ViT score {vit_fake_prob:.3f}")
        return _vit_only_classification(vit_fake_prob, vit_scores, models_used, "vit_only_no_face")

    eff_fake_prob: float = eff_result["score"]
    models_used.append(eff_result["model"])

    # Simple average ensemble.
    ensemble_prob = (vit_fake_prob + eff_fake_prob) / 2.0
    label = "Fake" if ensemble_prob >= 0.5 else "Real"
    logger.info(
        f"Image classify (ensemble) → {label} | vit={vit_fake_prob:.3f} eff={eff_fake_prob:.3f} avg={ensemble_prob:.3f}"
    )
    return ImageClassification(
        label=label,
        confidence=ensemble_prob,
        all_scores={
            # Prefix ViT class scores so they cannot collide with the EfficientNet keys.
            **{f"vit_{k}": v for k, v in vit_scores.items()},
            "efficientnet_fake": eff_fake_prob,
            "efficientnet_real": 1.0 - eff_fake_prob,
        },
        models_used=models_used,
        ensemble_method="average",
    )
123
 
124
 
125
  def preprocess_and_classify(raw_bytes: bytes) -> Tuple[Image.Image, ImageClassification]:
services/metadata_writer.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Optional ExifTool metadata writer — embeds DeepShield verdict into analyzed file metadata.
2
+
3
+ Gated behind EXIFTOOL_PATH env var. Silently skips if ExifTool is not configured.
4
+ Install ExifTool: https://exiftool.org/ — set EXIFTOOL_PATH in .env to enable.
5
+ """
6
+ from __future__ import annotations
7
+
8
+ import subprocess
9
+ from pathlib import Path
10
+ from typing import Optional
11
+
12
+ from loguru import logger
13
+
14
+ from config import settings
15
+
16
+
17
def _exiftool_path() -> Optional[str]:
    """Return the configured ExifTool executable path, or None if unset or not a file."""
    configured = getattr(settings, "EXIFTOOL_PATH", "")
    if not configured:
        return None
    return configured if Path(configured).is_file() else None
22
+
23
+
24
def write_verdict_metadata(
    file_path: str,
    verdict: str,
    authenticity_score: int,
    models_used: list[str],
    analysis_id: str,
) -> bool:
    """Write the DeepShield verdict into a file's Comment and UserComment tags.

    ExifTool is invoked with ``-overwrite_original`` and a 15-second timeout.

    Returns:
        True when ExifTool exits with status 0; False when ExifTool is not
        configured, exits non-zero, times out, or cannot be run.
    """
    exiftool = _exiftool_path()
    if not exiftool:
        return False

    comment = " | ".join(
        (
            f"DeepShield verdict: {verdict}",
            f"score: {authenticity_score}",
            f"models: {','.join(models_used)}",
            f"id: {analysis_id}",
        )
    )
    # Argument list (no shell), so the comment text needs no quoting.
    command = [
        exiftool,
        f"-Comment={comment}",
        f"-UserComment={comment}",
        "-overwrite_original",
        file_path,
    ]

    try:
        completed = subprocess.run(command, capture_output=True, text=True, timeout=15)
        if completed.returncode != 0:
            logger.warning(f"ExifTool failed (rc={completed.returncode}): {completed.stderr.strip()}")
            return False
        logger.info(f"ExifTool wrote verdict metadata to {file_path}")
        return True
    except FileNotFoundError:
        logger.warning(f"ExifTool not found at {exiftool}")
        return False
    except subprocess.TimeoutExpired:
        logger.warning("ExifTool timed out writing metadata")
        return False
    except Exception as e:
        logger.warning(f"ExifTool metadata write failed: {e}")
        return False
services/video_service.py CHANGED
@@ -1,15 +1,16 @@
1
  from __future__ import annotations
2
 
3
  from dataclasses import dataclass, field
4
- from typing import List, Tuple
5
 
6
  import cv2
7
  import numpy as np
8
  from loguru import logger
9
  from PIL import Image
10
 
 
11
  from models.model_loader import get_model_loader
12
- from services.image_service import classify_image
13
 
14
 
15
  @dataclass
@@ -18,10 +19,10 @@ class FrameAnalysis:
18
  timestamp_s: float
19
  label: str
20
  confidence: float
21
- suspicious_prob: float # prob of the fake/manipulated class
22
  is_suspicious: bool
23
  has_face: bool = False
24
- scored: bool = False # contributed to aggregate (face frames only)
25
 
26
 
27
  @dataclass
@@ -35,6 +36,8 @@ class VideoAggregation:
35
  insufficient_faces: bool
36
  suspicious_timestamps: List[float] = field(default_factory=list)
37
  frames: List[FrameAnalysis] = field(default_factory=list)
 
 
38
 
39
 
40
  FAKE_TOKENS = ("fake", "deepfake", "manipulated", "ai", "generated", "synthetic")
@@ -45,9 +48,9 @@ def _is_fake_label(label: str) -> bool:
45
  return any(tok in l for tok in FAKE_TOKENS)
46
 
47
 
48
- def extract_frames(video_path: str, num_frames: int = 16) -> List[Tuple[int, float, Image.Image]]:
49
- """Uniformly sample num_frames frames from the video. Returns list of
50
- (frame_index, timestamp_seconds, PIL.Image).
51
  """
52
  cap = cv2.VideoCapture(video_path)
53
  if not cap.isOpened():
@@ -62,7 +65,7 @@ def extract_frames(video_path: str, num_frames: int = 16) -> List[Tuple[int, flo
62
  n = min(num_frames, total)
63
  indices = np.linspace(0, max(0, total - 1), num=n, dtype=int).tolist()
64
 
65
- out: List[Tuple[int, float, Image.Image]] = []
66
  for idx in indices:
67
  cap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
68
  ok, frame_bgr = cap.read()
@@ -71,40 +74,97 @@ def extract_frames(video_path: str, num_frames: int = 16) -> List[Tuple[int, flo
71
  frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
72
  pil = Image.fromarray(frame_rgb)
73
  ts = (idx / fps) if fps > 0 else 0.0
74
- out.append((int(idx), float(ts), pil))
75
 
76
  cap.release()
77
  logger.info(f"Extracted {len(out)}/{n} frames from video (total={total}, fps={fps:.2f})")
78
  return out
79
 
80
 
81
- MIN_FACE_FRAMES = 3 # below this we refuse to issue a deepfake verdict
82
 
83
 
84
- def _has_face(pil: Image.Image) -> bool:
85
  detector = get_model_loader().load_face_detector()
86
  arr = np.array(pil)
87
  res = detector.process(arr)
88
  return bool(getattr(res, "multi_face_landmarks", None))
89
 
90
 
91
- def classify_frames(frames: List[Tuple[int, float, Image.Image]]) -> List[FrameAnalysis]:
 
 
 
 
 
 
 
 
 
 
92
  results: List[FrameAnalysis] = []
93
- for idx, ts, pil in frames:
94
- face = _has_face(pil)
95
- clf = classify_image(pil)
 
 
 
 
 
 
 
 
 
 
 
 
 
96
  fake_prob = 0.0
97
- for lbl, p in clf.all_scores.items():
98
- if _is_fake_label(lbl):
99
- fake_prob = max(fake_prob, float(p))
 
 
 
 
 
 
 
 
 
 
100
  results.append(
101
  FrameAnalysis(
102
  index=idx,
103
  timestamp_s=ts,
104
- label=clf.label,
105
- confidence=clf.confidence,
106
  suspicious_prob=fake_prob,
107
- is_suspicious=(fake_prob >= 0.5) and face,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
  has_face=face,
109
  scored=face,
110
  )
@@ -112,18 +172,20 @@ def classify_frames(frames: List[Tuple[int, float, Image.Image]]) -> List[FrameA
112
  return results
113
 
114
 
115
- def aggregate(frames: List[FrameAnalysis]) -> VideoAggregation:
116
- if not frames:
 
 
 
 
117
  return VideoAggregation(0, 0, 0, 0.0, 0.0, 0.0, True)
118
 
119
- scored = [f for f in frames if f.scored]
120
  num_face = len(scored)
121
  insufficient = num_face < MIN_FACE_FRAMES
122
 
123
  if insufficient:
124
- mean_p = 0.0
125
- max_p = 0.0
126
- susp_ratio = 0.0
127
  susp: List[FrameAnalysis] = []
128
  else:
129
  probs = [f.suspicious_prob for f in scored]
@@ -133,19 +195,28 @@ def aggregate(frames: List[FrameAnalysis]) -> VideoAggregation:
133
  susp_ratio = len(susp) / len(scored)
134
 
135
  return VideoAggregation(
136
- num_frames_sampled=len(frames),
137
  num_face_frames=num_face,
138
- num_suspicious_frames=len(susp),
139
  mean_suspicious_prob=mean_p,
140
  max_suspicious_prob=max_p,
141
  suspicious_ratio=susp_ratio,
142
  insufficient_faces=insufficient,
143
- suspicious_timestamps=[round(f.timestamp_s, 2) for f in susp],
144
- frames=frames,
 
 
145
  )
146
 
147
 
148
  def analyze_video(video_path: str, num_frames: int = 16) -> VideoAggregation:
149
  frames = extract_frames(video_path, num_frames=num_frames)
150
- classified = classify_frames(frames)
151
- return aggregate(classified)
 
 
 
 
 
 
 
 
1
  from __future__ import annotations
2
 
3
  from dataclasses import dataclass, field
4
+ from typing import List, Optional, Tuple
5
 
6
  import cv2
7
  import numpy as np
8
  from loguru import logger
9
  from PIL import Image
10
 
11
+ from config import settings
12
  from models.model_loader import get_model_loader
13
+ from services.image_service import _classify_vit
14
 
15
 
16
  @dataclass
 
19
  timestamp_s: float
20
  label: str
21
  confidence: float
22
+ suspicious_prob: float
23
  is_suspicious: bool
24
  has_face: bool = False
25
+ scored: bool = False
26
 
27
 
28
  @dataclass
 
36
  insufficient_faces: bool
37
  suspicious_timestamps: List[float] = field(default_factory=list)
38
  frames: List[FrameAnalysis] = field(default_factory=list)
39
+ models_used: List[str] = field(default_factory=list)
40
+ face_detector_used: str = "mediapipe"
41
 
42
 
43
  FAKE_TOKENS = ("fake", "deepfake", "manipulated", "ai", "generated", "synthetic")
 
48
  return any(tok in l for tok in FAKE_TOKENS)
49
 
50
 
51
+ def extract_frames(video_path: str, num_frames: int = 16) -> List[Tuple[int, float, np.ndarray, Image.Image]]:
52
+ """Uniformly sample num_frames frames from the video.
53
+ Returns list of (frame_index, timestamp_seconds, bgr_numpy, PIL.Image).
54
  """
55
  cap = cv2.VideoCapture(video_path)
56
  if not cap.isOpened():
 
65
  n = min(num_frames, total)
66
  indices = np.linspace(0, max(0, total - 1), num=n, dtype=int).tolist()
67
 
68
+ out: List[Tuple[int, float, np.ndarray, Image.Image]] = []
69
  for idx in indices:
70
  cap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
71
  ok, frame_bgr = cap.read()
 
74
  frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
75
  pil = Image.fromarray(frame_rgb)
76
  ts = (idx / fps) if fps > 0 else 0.0
77
+ out.append((int(idx), float(ts), frame_bgr, pil))
78
 
79
  cap.release()
80
  logger.info(f"Extracted {len(out)}/{n} frames from video (total={total}, fps={fps:.2f})")
81
  return out
82
 
83
 
84
+ MIN_FACE_FRAMES = 3
85
 
86
 
87
+ def _has_face_mediapipe(pil: Image.Image) -> bool:
88
  detector = get_model_loader().load_face_detector()
89
  arr = np.array(pil)
90
  res = detector.process(arr)
91
  return bool(getattr(res, "multi_face_landmarks", None))
92
 
93
 
94
def _analyze_with_efficientnet(
    frames: List[Tuple[int, float, np.ndarray, Image.Image]],
) -> Tuple[List[FrameAnalysis], str, List[str]]:
    """Primary path: classify each frame with EfficientNet on a BlazeFace crop.

    Falls back to the ViT pipeline when the EfficientNet detector cannot be
    loaded. For each frame, BlazeFace is tried first; if it finds nothing,
    MediaPipe is consulted only to record whether a face is present. Such a
    frame has no crop to classify, so it is reported with ``has_face=True`` and
    label ``"unknown"`` but ``scored=False``. This keeps its placeholder
    probability of 0.0 out of the aggregate.

    Returns:
        (frame_results, face_detector_used, models_used)
    """
    loader = get_model_loader()
    eff = loader.load_efficientnet()

    if eff is None:
        logger.warning("EfficientNet unavailable — falling back to ViT video pipeline")
        return _analyze_with_vit(frames), "mediapipe", [settings.IMAGE_MODEL_ID]

    # Imported once here (not per frame) and only after the fallback check, so a
    # missing optional dependency cannot break the ViT fallback above.
    import torch
    from scipy.special import expit

    results: List[FrameAnalysis] = []
    face_detector_used = "blazeface"
    models_used = [f"{settings.EFFICIENTNET_MODEL}_{settings.EFFICIENTNET_TRAIN_DB}"]

    for idx, ts, frame_bgr, pil in frames:
        # Pass RGB to EfficientNet (process_image expects RGB array).
        frame_rgb = frame_bgr[..., ::-1].copy()
        frame_data = eff.face_extractor.process_image(img=frame_rgb)
        faces: list = frame_data.get("faces", [])
        classified = bool(faces)  # True only when there is a crop to score
        has_face = classified

        if not has_face:
            # Fallback: check MediaPipe so we don't silently miss faces.
            has_face = _has_face_mediapipe(pil)
            if has_face:
                face_detector_used = "blazeface+mediapipe_fallback"

        fake_prob = 0.0
        if classified:
            # Run EfficientNet on the first face returned by BlazeFace.
            face_t = eff._face_tensor(faces[0])
            with torch.inference_mode():
                logit = eff.net(face_t.unsqueeze(0).to(eff.device))
            # NOTE(review): this is the raw sigmoid score; the detector's
            # calibrator is not applied here, unlike detect_image — confirm intended.
            fake_prob = float(expit(logit.cpu().numpy().item()))
            # Inclusive threshold, matching is_suspicious below.
            label = "Fake" if fake_prob >= 0.5 else "Real"
        elif has_face:
            label = "unknown"
        else:
            label = "no_face"

        results.append(
            FrameAnalysis(
                index=idx,
                timestamp_s=ts,
                label=label,
                confidence=fake_prob,
                suspicious_prob=fake_prob,
                is_suspicious=classified and fake_prob >= 0.5,
                has_face=has_face,
                # Only frames that were actually classified may feed the aggregate.
                scored=classified,
            )
        )

    return results, face_detector_used, models_used
150
+
151
+
152
+ def _analyze_with_vit(
153
+ frames: List[Tuple[int, float, np.ndarray, Image.Image]],
154
+ ) -> List[FrameAnalysis]:
155
+ """Fallback: original ViT-per-frame pipeline (MediaPipe face gate)."""
156
+ results: List[FrameAnalysis] = []
157
+ for idx, ts, _bgr, pil in frames:
158
+ face = _has_face_mediapipe(pil)
159
+ vit_fake_prob, vit_label, _ = _classify_vit(pil)
160
+ results.append(
161
+ FrameAnalysis(
162
+ index=idx,
163
+ timestamp_s=ts,
164
+ label=vit_label,
165
+ confidence=vit_fake_prob,
166
+ suspicious_prob=vit_fake_prob,
167
+ is_suspicious=(vit_fake_prob >= 0.5) and face,
168
  has_face=face,
169
  scored=face,
170
  )
 
172
  return results
173
 
174
 
175
+ def aggregate(
176
+ frame_results: List[FrameAnalysis],
177
+ models_used: Optional[List[str]] = None,
178
+ face_detector_used: str = "mediapipe",
179
+ ) -> VideoAggregation:
180
+ if not frame_results:
181
  return VideoAggregation(0, 0, 0, 0.0, 0.0, 0.0, True)
182
 
183
+ scored = [f for f in frame_results if f.scored]
184
  num_face = len(scored)
185
  insufficient = num_face < MIN_FACE_FRAMES
186
 
187
  if insufficient:
188
+ mean_p, max_p, susp_ratio = 0.0, 0.0, 0.0
 
 
189
  susp: List[FrameAnalysis] = []
190
  else:
191
  probs = [f.suspicious_prob for f in scored]
 
195
  susp_ratio = len(susp) / len(scored)
196
 
197
  return VideoAggregation(
198
+ num_frames_sampled=len(frame_results),
199
  num_face_frames=num_face,
200
+ num_suspicious_frames=len(susp) if not insufficient else 0,
201
  mean_suspicious_prob=mean_p,
202
  max_suspicious_prob=max_p,
203
  suspicious_ratio=susp_ratio,
204
  insufficient_faces=insufficient,
205
+ suspicious_timestamps=[round(f.timestamp_s, 2) for f in (susp if not insufficient else [])],
206
+ frames=frame_results,
207
+ models_used=models_used or [settings.IMAGE_MODEL_ID],
208
+ face_detector_used=face_detector_used,
209
  )
210
 
211
 
212
def analyze_video(video_path: str, num_frames: int = 16) -> VideoAggregation:
    """Sample frames from a video, classify them and aggregate the results.

    With ``settings.ENSEMBLE_MODE`` enabled the EfficientNet pipeline is used,
    otherwise the ViT pipeline with the MediaPipe face gate.
    """
    frames = extract_frames(video_path, num_frames=num_frames)

    if not settings.ENSEMBLE_MODE:
        return aggregate(
            _analyze_with_vit(frames),
            models_used=[settings.IMAGE_MODEL_ID],
            face_detector_used="mediapipe",
        )

    frame_results, face_detector_used, models_used = _analyze_with_efficientnet(frames)
    return aggregate(frame_results, models_used=models_used, face_detector_used=face_detector_used)
v1/__pycache__/__init__.cpython-311.pyc DELETED
Binary file (165 Bytes)
 
v1/__pycache__/analyze.cpython-311.pyc DELETED
Binary file (21.6 kB)
 
v1/__pycache__/auth.cpython-311.pyc DELETED
Binary file (3.82 kB)
 
v1/__pycache__/health.cpython-311.pyc DELETED
Binary file (556 Bytes)
 
v1/__pycache__/history.cpython-311.pyc DELETED
Binary file (5.19 kB)
 
v1/__pycache__/report.cpython-311.pyc DELETED
Binary file (4.29 kB)
 
v1/analyze.py CHANGED
@@ -55,6 +55,7 @@ from services.text_service import (
55
  score_sensationalism,
56
  )
57
  from services.video_service import analyze_video
 
58
  from utils.file_handler import read_upload_bytes, save_upload_to_tempfile
59
  from utils.scoring import compute_authenticity_score, get_verdict_label
60
 
@@ -89,7 +90,10 @@ async def analyze_image(
89
  heatmap_status = "success"
90
  heatmap = ""
91
  try:
92
- heatmap = generate_heatmap_base64(pil)
 
 
 
93
  stages.append("heatmap_generation")
94
  except Exception as e: # noqa: BLE001
95
  logger.warning(f"Heatmap generation failed, continuing: {e}")
@@ -155,6 +159,7 @@ async def analyze_image(
155
  stages_completed=stages,
156
  total_duration_ms=duration_ms,
157
  model_used=settings.IMAGE_MODEL_ID,
 
158
  ),
159
  )
160
 
@@ -218,11 +223,12 @@ async def analyze_video_endpoint(
218
  stages.append("frame_extraction")
219
  stages.append("frame_classification")
220
  stages.append("aggregation")
221
- finally:
222
  try:
223
  os.unlink(path)
224
  except OSError:
225
  pass
 
226
 
227
  if agg.insufficient_faces:
228
  score = 50
@@ -271,6 +277,7 @@ async def analyze_video_endpoint(
271
  stages_completed=stages,
272
  total_duration_ms=duration_ms,
273
  model_used=settings.IMAGE_MODEL_ID,
 
274
  ),
275
  )
276
 
@@ -290,6 +297,23 @@ async def analyze_video_endpoint(
290
  f"frames={agg.num_frames_sampled} susp={agg.num_suspicious_frames}"
291
  )
292
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
293
  # Phase 12: LLM explainability card
294
  try:
295
  response.llm_summary = generate_llm_summary(
 
55
  score_sensationalism,
56
  )
57
  from services.video_service import analyze_video
58
+ from services.metadata_writer import write_verdict_metadata
59
  from utils.file_handler import read_upload_bytes, save_upload_to_tempfile
60
  from utils.scoring import compute_authenticity_score, get_verdict_label
61
 
 
90
  heatmap_status = "success"
91
  heatmap = ""
92
  try:
93
+ model_family = "efficientnet" if settings.ENSEMBLE_MODE else "vit"
94
+ heatmap, heatmap_source = generate_heatmap_base64(pil, model_family=model_family)
95
+ if not heatmap:
96
+ heatmap_status = heatmap_source # "none" or "fallback"
97
  stages.append("heatmap_generation")
98
  except Exception as e: # noqa: BLE001
99
  logger.warning(f"Heatmap generation failed, continuing: {e}")
 
159
  stages_completed=stages,
160
  total_duration_ms=duration_ms,
161
  model_used=settings.IMAGE_MODEL_ID,
162
+ models_used=clf.models_used,
163
  ),
164
  )
165
 
 
223
  stages.append("frame_extraction")
224
  stages.append("frame_classification")
225
  stages.append("aggregation")
226
+ except Exception:
227
  try:
228
  os.unlink(path)
229
  except OSError:
230
  pass
231
+ raise
232
 
233
  if agg.insufficient_faces:
234
  score = 50
 
277
  stages_completed=stages,
278
  total_duration_ms=duration_ms,
279
  model_used=settings.IMAGE_MODEL_ID,
280
+ models_used=agg.models_used,
281
  ),
282
  )
283
 
 
297
  f"frames={agg.num_frames_sampled} susp={agg.num_suspicious_frames}"
298
  )
299
 
300
+ # Write verdict into video metadata (ExifTool, optional — gated by EXIFTOOL_PATH).
301
+ try:
302
+ write_verdict_metadata(
303
+ file_path=path,
304
+ verdict=label,
305
+ authenticity_score=score,
306
+ models_used=agg.models_used,
307
+ analysis_id=str(record.id),
308
+ )
309
+ except Exception as e: # noqa: BLE001
310
+ logger.warning(f"Metadata write failed: {e}")
311
+ finally:
312
+ try:
313
+ os.unlink(path)
314
+ except OSError:
315
+ pass
316
+
317
  # Phase 12: LLM explainability card
318
  try:
319
  response.llm_summary = generate_llm_summary(