Spaces:

ar07xd
/

deepshield

Runtime error

App Files Files Community

Spyderzz committed on Apr 20

Commit

3909c31

1 Parent(s): 4ef8b6a

feat: merge DeepShield1 EfficientNet ensemble into production deployment

Browse files

Files changed (45) hide show

config.py +7 -0
models/heatmap_generator.py +75 -6
models/icpr2020dfdc/.gitignore +5 -0
models/icpr2020dfdc/.travis.yml +15 -0
models/icpr2020dfdc/LICENSE +674 -0
models/icpr2020dfdc/README.md +120 -0
models/icpr2020dfdc/architectures/__init__.py +0 -0
models/icpr2020dfdc/architectures/externals/__init__.py +1 -0
models/icpr2020dfdc/architectures/externals/xception.py +236 -0
models/icpr2020dfdc/architectures/fornet.py +245 -0
models/icpr2020dfdc/architectures/tripletnet.py +44 -0
models/icpr2020dfdc/architectures/weights.py +24 -0
models/icpr2020dfdc/blazeface/__init__.py +3 -0
models/icpr2020dfdc/blazeface/anchors.npy +3 -0
models/icpr2020dfdc/blazeface/blazeface.pth +3 -0
models/icpr2020dfdc/blazeface/blazeface.py +417 -0
models/icpr2020dfdc/blazeface/face_extract.py +470 -0
models/icpr2020dfdc/blazeface/read_video.py +213 -0
models/icpr2020dfdc/environment.yml +25 -0
models/icpr2020dfdc/extract_faces.py +346 -0
models/icpr2020dfdc/index_celebdf.py +85 -0
models/icpr2020dfdc/index_dfdc.py +94 -0
models/icpr2020dfdc/index_ffpp.py +92 -0
models/icpr2020dfdc/isplutils/__init__.py +0 -0
models/icpr2020dfdc/isplutils/data.py +263 -0
models/icpr2020dfdc/isplutils/data_siamese.py +78 -0
models/icpr2020dfdc/isplutils/split.py +135 -0
models/icpr2020dfdc/isplutils/utils.py +247 -0
models/icpr2020dfdc/test_model.py +270 -0
models/icpr2020dfdc/train_binclass.py +460 -0
models/icpr2020dfdc/train_triplet.py +459 -0
models/model_loader.py +18 -0
requirements.txt +7 -0
schemas/common.py +1 -0
services/efficientnet_service.py +209 -0
services/image_service.py +81 -8
services/metadata_writer.py +73 -0
services/video_service.py +104 -33
v1/__pycache__/__init__.cpython-311.pyc +0 -0
v1/__pycache__/analyze.cpython-311.pyc +0 -0
v1/__pycache__/auth.cpython-311.pyc +0 -0
v1/__pycache__/health.cpython-311.pyc +0 -0
v1/__pycache__/history.cpython-311.pyc +0 -0
v1/__pycache__/report.cpython-311.pyc +0 -0
v1/analyze.py +26 -2

config.py CHANGED Viewed

@@ -42,6 +42,13 @@ class Settings(BaseSettings):
     LLM_API_KEY: str = ""
     LLM_MODEL: str = "gemini-2.5-pro"  # or "gpt-4o"
     # Auth
     JWT_SECRET_KEY: str = "change-me-in-production"
     JWT_ALGORITHM: str = "HS256"

     LLM_API_KEY: str = ""
     LLM_MODEL: str = "gemini-2.5-pro"  # or "gpt-4o"
+    # EfficientNet (ICPR2020 / DeepShield1 merge)
+    EFFICIENTNET_MODEL: str = "EfficientNetAutoAttB4"
+    EFFICIENTNET_TRAIN_DB: str = "DFDC"
+    ENSEMBLE_MODE: bool = True  # run both ViT + EfficientNet and average scores
+    VIDEO_SAMPLE_FRAMES: int = 16  # frames to sample per video for inference
+    EXIFTOOL_PATH: str = ""  # full path to ExifTool binary; empty = metadata write disabled
     # Auth
     JWT_SECRET_KEY: str = "change-me-in-production"
     JWT_ALGORITHM: str = "HS256"

models/heatmap_generator.py CHANGED Viewed

@@ -2,7 +2,7 @@ from __future__ import annotations
 import base64
 import io
-from typing import Optional
 import cv2
 import numpy as np
@@ -107,15 +107,84 @@ def _compute_gradcam_pp(
     return grayscale_cam, rgb_float
 def generate_heatmap_base64(
     pil_img: Image.Image,
     target_class_idx: Optional[int] = None,
-) -> str:
-    """Produce a base64 data-URL PNG of the Grad-CAM++ overlay for the given image."""
-    grayscale_cam, rgb_float = _compute_gradcam_pp(pil_img, target_class_idx)
     overlay = show_cam_on_image(rgb_float, grayscale_cam, use_rgb=True)
-    logger.info(f"Heatmap generated ({overlay.shape[0]}x{overlay.shape[1]})")
-    return _encode_overlay_to_base64(overlay)
 def generate_boxes_base64(

 import base64
 import io
+from typing import Literal, Optional
 import cv2
 import numpy as np
     return grayscale_cam, rgb_float
+def _compute_gradcam_pp_efficientnet(
+    pil_img: Image.Image,
+) -> tuple[np.ndarray, np.ndarray, Literal["attention", "gradcam++"]]:
+    """Grad-CAM++ for EfficientNetAutoAttB4.
+    Returns (grayscale_cam, rgb_float, heatmap_source).
+    Prefers the model's built-in attention map; falls back to Grad-CAM++ on the
+    last MBConv block if attention extraction fails.
+    """
+    loader = get_model_loader()
+    eff = loader.load_efficientnet()
+    if eff is None:
+        raise RuntimeError("EfficientNet not loaded")
+    if pil_img.mode != "RGB":
+        pil_img = pil_img.convert("RGB")
+    img_np = np.array(pil_img)
+    # Prepare face crop (same path as detect_image).
+    frame_data = eff.face_extractor.process_image(img=img_np)
+    faces: list = frame_data.get("faces", [])
+    if not faces:
+        raise ValueError("no_face")
+    face_t = eff._face_tensor(faces[0]).unsqueeze(0).to(eff.device)
+    # Resize the face crop to float [0,1] for overlay.
+    face_np = faces[0]
+    h, w = face_np.shape[:2]
+    rgb_float = face_np.astype(np.float32) / 255.0
+    if rgb_float.shape[:2] != (224, 224):
+        rgb_float = cv2.resize(rgb_float, (224, 224)).astype(np.float32)
+    # Try Grad-CAM++ on last MBConv block (_blocks[-1]).
+    try:
+        net = eff.net
+        target_layers = [net.efficientnet._blocks[-1]]
+        face_t.requires_grad_(True)
+        for p in net.parameters():
+            p.requires_grad_(True)
+        with GradCAMPlusPlus(model=net, target_layers=target_layers) as cam:
+            grayscale_cam = cam(input_tensor=face_t, targets=None)[0]
+        return grayscale_cam, rgb_float, "gradcam++"
+    except Exception as e:
+        logger.warning(f"EfficientNet Grad-CAM++ failed ({e}), using uniform fallback")
+        grayscale_cam = np.ones((224, 224), dtype=np.float32) * 0.5
+        return grayscale_cam, rgb_float, "gradcam++"
 def generate_heatmap_base64(
     pil_img: Image.Image,
     target_class_idx: Optional[int] = None,
+    model_family: Literal["vit", "efficientnet"] = "vit",
+) -> tuple[str, str]:
+    """Produce a base64 data-URL PNG of the Grad-CAM++ overlay.
+    Returns (base64_png, heatmap_source) where heatmap_source is one of
+    "gradcam++", "attention", "fallback", "none".
+    With model_family="efficientnet" the map comes from
+    _compute_gradcam_pp_efficientnet(); a ValueError from it yields
+    ("", "none") and any other exception yields ("", "fallback"), so this
+    path does not raise. Any other model_family uses _compute_gradcam_pp()
+    and reports "gradcam++"; exceptions on that path propagate.
+    target_class_idx is only forwarded on the non-EfficientNet path.
+    """
+    if model_family == "efficientnet":
+        try:
+            grayscale_cam, rgb_float, source = _compute_gradcam_pp_efficientnet(pil_img)
+        except ValueError:
+            # The helper raises ValueError("no_face") when no face is found;
+            # every ValueError from it is treated that way here.
+            logger.info("EfficientNet heatmap skipped — no face detected")
+            return "", "none"
+        except Exception as e:
+            # Best-effort: heatmap failure is logged and reported, never raised.
+            logger.warning(f"EfficientNet heatmap failed: {e}")
+            return "", "fallback"
+    else:
+        grayscale_cam, rgb_float = _compute_gradcam_pp(pil_img, target_class_idx)
+        source = "gradcam++"
     overlay = show_cam_on_image(rgb_float, grayscale_cam, use_rgb=True)
+    logger.info(f"Heatmap generated ({overlay.shape[0]}x{overlay.shape[1]}) source={source}")
+    return _encode_overlay_to_base64(overlay), source
 def generate_boxes_base64(

models/icpr2020dfdc/.gitignore ADDED Viewed

	@@ -0,0 +1,5 @@

+.idea/
+.DS_Store
+.ipynb_checkpoints/
+__pycache__/

models/icpr2020dfdc/.travis.yml ADDED Viewed

	@@ -0,0 +1,15 @@

+# Travis CI configuration: builds a conda environment and runs two unittest suites.
+# NOTE(review): no test/ directory appears in this commit's file list — confirm
+# the suites below exist before relying on this configuration.
+language: python
+python:
+  - "3.6.9"
+install:
+  # Download the Miniconda installer and run it in batch mode into $HOME/miniconda3.
+  - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O $HOME/miniconda.sh
+  - bash $HOME/miniconda.sh -bfp $HOME/miniconda3
+  - export PATH=$HOME/miniconda3/bin:$PATH
+  # Create the conda environment described by environment.yml.
+  - conda env create -f environment.yml
+before_script:
+  # presumably "icpr2020" is the environment name declared in environment.yml — confirm
+  - source activate icpr2020
+  # The unittest modules are resolved relative to the test/ directory.
+  - cd test
+script:
+  - python -m unittest test_dfdc.TestDFDC
+  - python -m unittest test_ffpp.TestFFPP

models/icpr2020dfdc/LICENSE ADDED Viewed

	@@ -0,0 +1,674 @@

+                    GNU GENERAL PUBLIC LICENSE
+                       Version 3, 29 June 2007
+ Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+                            Preamble
+  The GNU General Public License is a free, copyleft license for
+software and other kinds of works.
+  The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works.  By contrast,
+the GNU General Public License is intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users.  We, the Free Software Foundation, use the
+GNU General Public License for most of our software; it applies also to
+any other work released this way by its authors.  You can apply it to
+your programs, too.
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+  To protect your rights, we need to prevent others from denying you
+these rights or asking you to surrender the rights.  Therefore, you have
+certain responsibilities if you distribute copies of the software, or if
+you modify it: responsibilities to respect the freedom of others.
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must pass on to the recipients the same
+freedoms that you received.  You must make sure that they, too, receive
+or can get the source code.  And you must show them these terms so they
+know their rights.
+  Developers that use the GNU GPL protect your rights with two steps:
+(1) assert copyright on the software, and (2) offer you this License
+giving you legal permission to copy, distribute and/or modify it.
+  For the developers' and authors' protection, the GPL clearly explains
+that there is no warranty for this free software.  For both users' and
+authors' sake, the GPL requires that modified versions be marked as
+changed, so that their problems will not be attributed erroneously to
+authors of previous versions.
+  Some devices are designed to deny users access to install or run
+modified versions of the software inside them, although the manufacturer
+can do so.  This is fundamentally incompatible with the aim of
+protecting users' freedom to change the software.  The systematic
+pattern of such abuse occurs in the area of products for individuals to
+use, which is precisely where it is most unacceptable.  Therefore, we
+have designed this version of the GPL to prohibit the practice for those
+products.  If such problems arise substantially in other domains, we
+stand ready to extend this provision to those domains in future versions
+of the GPL, as needed to protect the freedom of users.
+  Finally, every program is threatened constantly by software patents.
+States should not allow patents to restrict development and use of
+software on general-purpose computers, but in those that do, we wish to
+avoid the special danger that patents applied to a free program could
+make it effectively proprietary.  To prevent this, the GPL assures that
+patents cannot be used to render the program non-free.
+  The precise terms and conditions for copying, distribution and
+modification follow.
+                       TERMS AND CONDITIONS
+  0. Definitions.
+  "This License" refers to version 3 of the GNU General Public License.
+  "Copyright" also means copyright-like laws that apply to other kinds of
+works, such as semiconductor masks.
+  "The Program" refers to any copyrightable work licensed under this
+License.  Each licensee is addressed as "you".  "Licensees" and
+"recipients" may be individuals or organizations.
+  To "modify" a work means to copy from or adapt all or part of the work
+in a fashion requiring copyright permission, other than the making of an
+exact copy.  The resulting work is called a "modified version" of the
+earlier work or a work "based on" the earlier work.
+  A "covered work" means either the unmodified Program or a work based
+on the Program.
+  To "propagate" a work means to do anything with it that, without
+permission, would make you directly or secondarily liable for
+infringement under applicable copyright law, except executing it on a
+computer or modifying a private copy.  Propagation includes copying,
+distribution (with or without modification), making available to the
+public, and in some countries other activities as well.
+  To "convey" a work means any kind of propagation that enables other
+parties to make or receive copies.  Mere interaction with a user through
+a computer network, with no transfer of a copy, is not conveying.
+  An interactive user interface displays "Appropriate Legal Notices"
+to the extent that it includes a convenient and prominently visible
+feature that (1) displays an appropriate copyright notice, and (2)
+tells the user that there is no warranty for the work (except to the
+extent that warranties are provided), that licensees may convey the
+work under this License, and how to view a copy of this License.  If
+the interface presents a list of user commands or options, such as a
+menu, a prominent item in the list meets this criterion.
+  1. Source Code.
+  The "source code" for a work means the preferred form of the work
+for making modifications to it.  "Object code" means any non-source
+form of a work.
+  A "Standard Interface" means an interface that either is an official
+standard defined by a recognized standards body, or, in the case of
+interfaces specified for a particular programming language, one that
+is widely used among developers working in that language.
+  The "System Libraries" of an executable work include anything, other
+than the work as a whole, that (a) is included in the normal form of
+packaging a Major Component, but which is not part of that Major
+Component, and (b) serves only to enable use of the work with that
+Major Component, or to implement a Standard Interface for which an
+implementation is available to the public in source code form.  A
+"Major Component", in this context, means a major essential component
+(kernel, window system, and so on) of the specific operating system
+(if any) on which the executable work runs, or a compiler used to
+produce the work, or an object code interpreter used to run it.
+  The "Corresponding Source" for a work in object code form means all
+the source code needed to generate, install, and (for an executable
+work) run the object code and to modify the work, including scripts to
+control those activities.  However, it does not include the work's
+System Libraries, or general-purpose tools or generally available free
+programs which are used unmodified in performing those activities but
+which are not part of the work.  For example, Corresponding Source
+includes interface definition files associated with source files for
+the work, and the source code for shared libraries and dynamically
+linked subprograms that the work is specifically designed to require,
+such as by intimate data communication or control flow between those
+subprograms and other parts of the work.
+  The Corresponding Source need not include anything that users
+can regenerate automatically from other parts of the Corresponding
+Source.
+  The Corresponding Source for a work in source code form is that
+same work.
+  2. Basic Permissions.
+  All rights granted under this License are granted for the term of
+copyright on the Program, and are irrevocable provided the stated
+conditions are met.  This License explicitly affirms your unlimited
+permission to run the unmodified Program.  The output from running a
+covered work is covered by this License only if the output, given its
+content, constitutes a covered work.  This License acknowledges your
+rights of fair use or other equivalent, as provided by copyright law.
+  You may make, run and propagate covered works that you do not
+convey, without conditions so long as your license otherwise remains
+in force.  You may convey covered works to others for the sole purpose
+of having them make modifications exclusively for you, or provide you
+with facilities for running those works, provided that you comply with
+the terms of this License in conveying all material for which you do
+not control copyright.  Those thus making or running the covered works
+for you must do so exclusively on your behalf, under your direction
+and control, on terms that prohibit them from making any copies of
+your copyrighted material outside their relationship with you.
+  Conveying under any other circumstances is permitted solely under
+the conditions stated below.  Sublicensing is not allowed; section 10
+makes it unnecessary.
+  3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+  No covered work shall be deemed part of an effective technological
+measure under any applicable law fulfilling obligations under article
+11 of the WIPO copyright treaty adopted on 20 December 1996, or
+similar laws prohibiting or restricting circumvention of such
+measures.
+  When you convey a covered work, you waive any legal power to forbid
+circumvention of technological measures to the extent such circumvention
+is effected by exercising rights under this License with respect to
+the covered work, and you disclaim any intention to limit operation or
+modification of the work as a means of enforcing, against the work's
+users, your or third parties' legal rights to forbid circumvention of
+technological measures.
+  4. Conveying Verbatim Copies.
+  You may convey verbatim copies of the Program's source code as you
+receive it, in any medium, provided that you conspicuously and
+appropriately publish on each copy an appropriate copyright notice;
+keep intact all notices stating that this License and any
+non-permissive terms added in accord with section 7 apply to the code;
+keep intact all notices of the absence of any warranty; and give all
+recipients a copy of this License along with the Program.
+  You may charge any price or no price for each copy that you convey,
+and you may offer support or warranty protection for a fee.
+  5. Conveying Modified Source Versions.
+  You may convey a work based on the Program, or the modifications to
+produce it from the Program, in the form of source code under the
+terms of section 4, provided that you also meet all of these conditions:
+    a) The work must carry prominent notices stating that you modified
+    it, and giving a relevant date.
+    b) The work must carry prominent notices stating that it is
+    released under this License and any conditions added under section
+    7.  This requirement modifies the requirement in section 4 to
+    "keep intact all notices".
+    c) You must license the entire work, as a whole, under this
+    License to anyone who comes into possession of a copy.  This
+    License will therefore apply, along with any applicable section 7
+    additional terms, to the whole of the work, and all its parts,
+    regardless of how they are packaged.  This License gives no
+    permission to license the work in any other way, but it does not
+    invalidate such permission if you have separately received it.
+    d) If the work has interactive user interfaces, each must display
+    Appropriate Legal Notices; however, if the Program has interactive
+    interfaces that do not display Appropriate Legal Notices, your
+    work need not make them do so.
+  A compilation of a covered work with other separate and independent
+works, which are not by their nature extensions of the covered work,
+and which are not combined with it such as to form a larger program,
+in or on a volume of a storage or distribution medium, is called an
+"aggregate" if the compilation and its resulting copyright are not
+used to limit the access or legal rights of the compilation's users
+beyond what the individual works permit.  Inclusion of a covered work
+in an aggregate does not cause this License to apply to the other
+parts of the aggregate.
+  6. Conveying Non-Source Forms.
+  You may convey a covered work in object code form under the terms
+of sections 4 and 5, provided that you also convey the
+machine-readable Corresponding Source under the terms of this License,
+in one of these ways:
+    a) Convey the object code in, or embodied in, a physical product
+    (including a physical distribution medium), accompanied by the
+    Corresponding Source fixed on a durable physical medium
+    customarily used for software interchange.
+    b) Convey the object code in, or embodied in, a physical product
+    (including a physical distribution medium), accompanied by a
+    written offer, valid for at least three years and valid for as
+    long as you offer spare parts or customer support for that product
+    model, to give anyone who possesses the object code either (1) a
+    copy of the Corresponding Source for all the software in the
+    product that is covered by this License, on a durable physical
+    medium customarily used for software interchange, for a price no
+    more than your reasonable cost of physically performing this
+    conveying of source, or (2) access to copy the
+    Corresponding Source from a network server at no charge.
+    c) Convey individual copies of the object code with a copy of the
+    written offer to provide the Corresponding Source.  This
+    alternative is allowed only occasionally and noncommercially, and
+    only if you received the object code with such an offer, in accord
+    with subsection 6b.
+    d) Convey the object code by offering access from a designated
+    place (gratis or for a charge), and offer equivalent access to the
+    Corresponding Source in the same way through the same place at no
+    further charge.  You need not require recipients to copy the
+    Corresponding Source along with the object code.  If the place to
+    copy the object code is a network server, the Corresponding Source
+    may be on a different server (operated by you or a third party)
+    that supports equivalent copying facilities, provided you maintain
+    clear directions next to the object code saying where to find the
+    Corresponding Source.  Regardless of what server hosts the
+    Corresponding Source, you remain obligated to ensure that it is
+    available for as long as needed to satisfy these requirements.
+    e) Convey the object code using peer-to-peer transmission, provided
+    you inform other peers where the object code and Corresponding
+    Source of the work are being offered to the general public at no
+    charge under subsection 6d.
+  A separable portion of the object code, whose source code is excluded
+from the Corresponding Source as a System Library, need not be
+included in conveying the object code work.
+  A "User Product" is either (1) a "consumer product", which means any
+tangible personal property which is normally used for personal, family,
+or household purposes, or (2) anything designed or sold for incorporation
+into a dwelling.  In determining whether a product is a consumer product,
+doubtful cases shall be resolved in favor of coverage.  For a particular
+product received by a particular user, "normally used" refers to a
+typical or common use of that class of product, regardless of the status
+of the particular user or of the way in which the particular user
+actually uses, or expects or is expected to use, the product.  A product
+is a consumer product regardless of whether the product has substantial
+commercial, industrial or non-consumer uses, unless such uses represent
+the only significant mode of use of the product.
+  "Installation Information" for a User Product means any methods,
+procedures, authorization keys, or other information required to install
+and execute modified versions of a covered work in that User Product from
+a modified version of its Corresponding Source.  The information must
+suffice to ensure that the continued functioning of the modified object
+code is in no case prevented or interfered with solely because
+modification has been made.
+  If you convey an object code work under this section in, or with, or
+specifically for use in, a User Product, and the conveying occurs as
+part of a transaction in which the right of possession and use of the
+User Product is transferred to the recipient in perpetuity or for a
+fixed term (regardless of how the transaction is characterized), the
+Corresponding Source conveyed under this section must be accompanied
+by the Installation Information.  But this requirement does not apply
+if neither you nor any third party retains the ability to install
+modified object code on the User Product (for example, the work has
+been installed in ROM).
+  The requirement to provide Installation Information does not include a
+requirement to continue to provide support service, warranty, or updates
+for a work that has been modified or installed by the recipient, or for
+the User Product in which it has been modified or installed.  Access to a
+network may be denied when the modification itself materially and
+adversely affects the operation of the network or violates the rules and
+protocols for communication across the network.
+  Corresponding Source conveyed, and Installation Information provided,
+in accord with this section must be in a format that is publicly
+documented (and with an implementation available to the public in
+source code form), and must require no special password or key for
+unpacking, reading or copying.
+  7. Additional Terms.
+  "Additional permissions" are terms that supplement the terms of this
+License by making exceptions from one or more of its conditions.
+Additional permissions that are applicable to the entire Program shall
+be treated as though they were included in this License, to the extent
+that they are valid under applicable law.  If additional permissions
+apply only to part of the Program, that part may be used separately
+under those permissions, but the entire Program remains governed by
+this License without regard to the additional permissions.
+  When you convey a copy of a covered work, you may at your option
+remove any additional permissions from that copy, or from any part of
+it.  (Additional permissions may be written to require their own
+removal in certain cases when you modify the work.)  You may place
+additional permissions on material, added by you to a covered work,
+for which you have or can give appropriate copyright permission.
+  Notwithstanding any other provision of this License, for material you
+add to a covered work, you may (if authorized by the copyright holders of
+that material) supplement the terms of this License with terms:
+    a) Disclaiming warranty or limiting liability differently from the
+    terms of sections 15 and 16 of this License; or
+    b) Requiring preservation of specified reasonable legal notices or
+    author attributions in that material or in the Appropriate Legal
+    Notices displayed by works containing it; or
+    c) Prohibiting misrepresentation of the origin of that material, or
+    requiring that modified versions of such material be marked in
+    reasonable ways as different from the original version; or
+    d) Limiting the use for publicity purposes of names of licensors or
+    authors of the material; or
+    e) Declining to grant rights under trademark law for use of some
+    trade names, trademarks, or service marks; or
+    f) Requiring indemnification of licensors and authors of that
+    material by anyone who conveys the material (or modified versions of
+    it) with contractual assumptions of liability to the recipient, for
+    any liability that these contractual assumptions directly impose on
+    those licensors and authors.
+  All other non-permissive additional terms are considered "further
+restrictions" within the meaning of section 10.  If the Program as you
+received it, or any part of it, contains a notice stating that it is
+governed by this License along with a term that is a further
+restriction, you may remove that term.  If a license document contains
+a further restriction but permits relicensing or conveying under this
+License, you may add to a covered work material governed by the terms
+of that license document, provided that the further restriction does
+not survive such relicensing or conveying.
+  If you add terms to a covered work in accord with this section, you
+must place, in the relevant source files, a statement of the
+additional terms that apply to those files, or a notice indicating
+where to find the applicable terms.
+  Additional terms, permissive or non-permissive, may be stated in the
+form of a separately written license, or stated as exceptions;
+the above requirements apply either way.
+  8. Termination.
+  You may not propagate or modify a covered work except as expressly
+provided under this License.  Any attempt otherwise to propagate or
+modify it is void, and will automatically terminate your rights under
+this License (including any patent licenses granted under the third
+paragraph of section 11).
+  However, if you cease all violation of this License, then your
+license from a particular copyright holder is reinstated (a)
+provisionally, unless and until the copyright holder explicitly and
+finally terminates your license, and (b) permanently, if the copyright
+holder fails to notify you of the violation by some reasonable means
+prior to 60 days after the cessation.
+  Moreover, your license from a particular copyright holder is
+reinstated permanently if the copyright holder notifies you of the
+violation by some reasonable means, this is the first time you have
+received notice of violation of this License (for any work) from that
+copyright holder, and you cure the violation prior to 30 days after
+your receipt of the notice.
+  Termination of your rights under this section does not terminate the
+licenses of parties who have received copies or rights from you under
+this License.  If your rights have been terminated and not permanently
+reinstated, you do not qualify to receive new licenses for the same
+material under section 10.
+  9. Acceptance Not Required for Having Copies.
+  You are not required to accept this License in order to receive or
+run a copy of the Program.  Ancillary propagation of a covered work
+occurring solely as a consequence of using peer-to-peer transmission
+to receive a copy likewise does not require acceptance.  However,
+nothing other than this License grants you permission to propagate or
+modify any covered work.  These actions infringe copyright if you do
+not accept this License.  Therefore, by modifying or propagating a
+covered work, you indicate your acceptance of this License to do so.
+  10. Automatic Licensing of Downstream Recipients.
+  Each time you convey a covered work, the recipient automatically
+receives a license from the original licensors, to run, modify and
+propagate that work, subject to this License.  You are not responsible
+for enforcing compliance by third parties with this License.
+  An "entity transaction" is a transaction transferring control of an
+organization, or substantially all assets of one, or subdividing an
+organization, or merging organizations.  If propagation of a covered
+work results from an entity transaction, each party to that
+transaction who receives a copy of the work also receives whatever
+licenses to the work the party's predecessor in interest had or could
+give under the previous paragraph, plus a right to possession of the
+Corresponding Source of the work from the predecessor in interest, if
+the predecessor has it or can get it with reasonable efforts.
+  You may not impose any further restrictions on the exercise of the
+rights granted or affirmed under this License.  For example, you may
+not impose a license fee, royalty, or other charge for exercise of
+rights granted under this License, and you may not initiate litigation
+(including a cross-claim or counterclaim in a lawsuit) alleging that
+any patent claim is infringed by making, using, selling, offering for
+sale, or importing the Program or any portion of it.
+  11. Patents.
+  A "contributor" is a copyright holder who authorizes use under this
+License of the Program or a work on which the Program is based.  The
+work thus licensed is called the contributor's "contributor version".
+  A contributor's "essential patent claims" are all patent claims
+owned or controlled by the contributor, whether already acquired or
+hereafter acquired, that would be infringed by some manner, permitted
+by this License, of making, using, or selling its contributor version,
+but do not include claims that would be infringed only as a
+consequence of further modification of the contributor version.  For
+purposes of this definition, "control" includes the right to grant
+patent sublicenses in a manner consistent with the requirements of
+this License.
+  Each contributor grants you a non-exclusive, worldwide, royalty-free
+patent license under the contributor's essential patent claims, to
+make, use, sell, offer for sale, import and otherwise run, modify and
+propagate the contents of its contributor version.
+  In the following three paragraphs, a "patent license" is any express
+agreement or commitment, however denominated, not to enforce a patent
+(such as an express permission to practice a patent or covenant not to
+sue for patent infringement).  To "grant" such a patent license to a
+party means to make such an agreement or commitment not to enforce a
+patent against the party.
+  If you convey a covered work, knowingly relying on a patent license,
+and the Corresponding Source of the work is not available for anyone
+to copy, free of charge and under the terms of this License, through a
+publicly available network server or other readily accessible means,
+then you must either (1) cause the Corresponding Source to be so
+available, or (2) arrange to deprive yourself of the benefit of the
+patent license for this particular work, or (3) arrange, in a manner
+consistent with the requirements of this License, to extend the patent
+license to downstream recipients.  "Knowingly relying" means you have
+actual knowledge that, but for the patent license, your conveying the
+covered work in a country, or your recipient's use of the covered work
+in a country, would infringe one or more identifiable patents in that
+country that you have reason to believe are valid.
+  If, pursuant to or in connection with a single transaction or
+arrangement, you convey, or propagate by procuring conveyance of, a
+covered work, and grant a patent license to some of the parties
+receiving the covered work authorizing them to use, propagate, modify
+or convey a specific copy of the covered work, then the patent license
+you grant is automatically extended to all recipients of the covered
+work and works based on it.
+  A patent license is "discriminatory" if it does not include within
+the scope of its coverage, prohibits the exercise of, or is
+conditioned on the non-exercise of one or more of the rights that are
+specifically granted under this License.  You may not convey a covered
+work if you are a party to an arrangement with a third party that is
+in the business of distributing software, under which you make payment
+to the third party based on the extent of your activity of conveying
+the work, and under which the third party grants, to any of the
+parties who would receive the covered work from you, a discriminatory
+patent license (a) in connection with copies of the covered work
+conveyed by you (or copies made from those copies), or (b) primarily
+for and in connection with specific products or compilations that
+contain the covered work, unless you entered into that arrangement,
+or that patent license was granted, prior to 28 March 2007.
+  Nothing in this License shall be construed as excluding or limiting
+any implied license or other defenses to infringement that may
+otherwise be available to you under applicable patent law.
+  12. No Surrender of Others' Freedom.
+  If conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot convey a
+covered work so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you may
+not convey it at all.  For example, if you agree to terms that obligate you
+to collect a royalty for further conveying from those to whom you convey
+the Program, the only way you could satisfy both those terms and this
+License would be to refrain entirely from conveying the Program.
+  13. Use with the GNU Affero General Public License.
+  Notwithstanding any other provision of this License, you have
+permission to link or combine any covered work with a work licensed
+under version 3 of the GNU Affero General Public License into a single
+combined work, and to convey the resulting work.  The terms of this
+License will continue to apply to the part which is the covered work,
+but the special requirements of the GNU Affero General Public License,
+section 13, concerning interaction through a network will apply to the
+combination as such.
+  14. Revised Versions of this License.
+  The Free Software Foundation may publish revised and/or new versions of
+the GNU General Public License from time to time.  Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+  Each version is given a distinguishing version number.  If the
+Program specifies that a certain numbered version of the GNU General
+Public License "or any later version" applies to it, you have the
+option of following the terms and conditions either of that numbered
+version or of any later version published by the Free Software
+Foundation.  If the Program does not specify a version number of the
+GNU General Public License, you may choose any version ever published
+by the Free Software Foundation.
+  If the Program specifies that a proxy can decide which future
+versions of the GNU General Public License can be used, that proxy's
+public statement of acceptance of a version permanently authorizes you
+to choose that version for the Program.
+  Later license versions may give you additional or different
+permissions.  However, no additional obligations are imposed on any
+author or copyright holder as a result of your choosing to follow a
+later version.
+  15. Disclaimer of Warranty.
+  THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
+APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
+HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
+OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
+IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
+ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+  16. Limitation of Liability.
+  IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
+THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
+GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
+USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
+DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
+PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
+EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.
+  17. Interpretation of Sections 15 and 16.
+  If the disclaimer of warranty and limitation of liability provided
+above cannot be given local legal effect according to their terms,
+reviewing courts shall apply local law that most closely approximates
+an absolute waiver of all civil liability in connection with the
+Program, unless a warranty or assumption of liability accompanies a
+copy of the Program in return for a fee.
+                     END OF TERMS AND CONDITIONS
+            How to Apply These Terms to Your New Programs
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+  To do so, attach the following notices to the program.  It is safest
+to attach them to the start of each source file to most effectively
+state the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <https://www.gnu.org/licenses/>.
+Also add information on how to contact you by electronic and paper mail.
+  If the program does terminal interaction, make it output a short
+notice like this when it starts in an interactive mode:
+    <program>  Copyright (C) <year>  <name of author>
+    This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, your program's commands
+might be different; for a GUI interface, you would use an "about box".
+  You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU GPL, see
+<https://www.gnu.org/licenses/>.
+  The GNU General Public License does not permit incorporating your program
+into proprietary programs.  If your program is a subroutine library, you
+may consider it more useful to permit linking proprietary applications with
+the library.  If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.  But first, please read
+<https://www.gnu.org/licenses/why-not-lgpl.html>.

models/icpr2020dfdc/README.md ADDED Viewed

	@@ -0,0 +1,120 @@

+# Video Face Manipulation Detection Through Ensemble of CNNs
+[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/video-face-manipulation-detection-through/deepfake-detection-on-dfdc)](https://paperswithcode.com/sota/deepfake-detection-on-dfdc?p=video-face-manipulation-detection-through)
+[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/video-face-manipulation-detection-through/deepfake-detection-on-faceforensics-1)](https://paperswithcode.com/sota/deepfake-detection-on-faceforensics-1?p=video-face-manipulation-detection-through)
+[![Build Status](https://travis-ci.org/polimi-ispl/icpr2020dfdc.svg?branch=master)](https://travis-ci.org/polimi-ispl/icpr2020dfdc)
+![](assets/faces_attention.png)
+<p align='center'>
+  <img src='assets/mqzvfufzoq_face.gif'/>
+  <img src='assets/mqzvfufzoq_face_att.gif'/>
+</p>
+This is the official repository of **Video Face Manipulation Detection Through Ensemble of CNNs**,
+presented at [ICPR2020](https://www.micc.unifi.it/icpr2020/) and currently available on [IEEExplore](https://ieeexplore.ieee.org/document/9412711) and [arXiv](https://arxiv.org/abs/2004.07676).
+If you use this repository for your research, please consider citing our paper. Refer to [How to cite](https://github.com/polimi-ispl/icpr2020dfdc#how-to-cite) section to get the correct entry for your bibliography.
+We participated as the **ISPL** team in the [Kaggle Deepfake Detection Challenge](https://www.kaggle.com/c/deepfake-detection-challenge/).
+With this implementation, we reached the 41st position out of 2116 teams (**top 2%**) on the [private leaderboard](https://www.kaggle.com/c/deepfake-detection-challenge/leaderboard).
+This repository is currently under maintenance; if you are experiencing any problems, please open an [issue](https://github.com/polimi-ispl/icpr2020dfdc/issues).
+## Getting started
+### Prerequisites
+- Install [conda](https://docs.conda.io/en/latest/miniconda.html)
+- Create the `icpr2020` environment with *environment.yml*
+```bash
+$ conda env create -f environment.yml
+$ conda activate icpr2020
+```
+- Download and unzip the [datasets](#datasets)
+### Quick run
+If you just want to test the pre-trained models against your own videos or images:
+- [Video prediction notebook](https://github.com/polimi-ispl/icpr2020dfdc/blob/master/notebook/Video%20prediction.ipynb) <a target="_blank" href="https://colab.research.google.com/drive/12WnvmerHBNbJ49HdoH1lli_O8SwaFPjv?usp=sharing">
+  <img src="https://colab.research.google.com/assets/colab-badge.svg">
+</a>
+- [Image prediction notebook](https://github.com/polimi-ispl/icpr2020dfdc/blob/master/notebook/Image%20prediction.ipynb) <a target="_blank" href="https://colab.research.google.com/drive/19oVKlzEr58VZfRnSq-nW8kFYuxkh3GM8?usp=sharing">
+  <img src="https://colab.research.google.com/assets/colab-badge.svg">
+</a>
+- [Image prediction with attention](notebook/Image%20prediction%20and%20attention.ipynb) <a target="_blank" href="https://colab.research.google.com/drive/1zcglis2Qx2vtJhrogn8aKA-mbUotLZLK?usp=sharing">
+  <img src="https://colab.research.google.com/assets/colab-badge.svg">
+</a>
+### The whole pipeline
+You need to preprocess the datasets in order to index all the samples and extract faces. Just run the script [make_dataset.sh](scripts/make_dataset.sh)
+```bash
+$ ./scripts/make_dataset.sh
+```
+Please note that we use only 32 frames per video. You can easily tweak this parameter in [extract_faces.py](extract_faces.py)
+Also, please note that **for the DFDC** we have resorted to _the training split_ exclusively!
+In `scripts/make_dataset.sh` the value of `DFDC_SRC` should point to the directory containing the DFDC train split.
+### Celeb-DF (v2)
+Although **we did not use this dataset in the paper**, we provide a script [index_celebdf.py](index_celebdf.py) to index the videos similarly to
+DFDC and FF++. Once you have the index, you can proceed with the pipeline starting from [extract_faces.py](extract_faces.py). You can also use the
+split `celebdf` during training/testing.
+### Train
+In [train_all.sh](scripts/train_all.sh) you can find a comprehensive list of all the commands to train the models presented in the paper.
+Please refer to the comments in the script for hints on their usage.
+#### Training a single model
+If you want to train some models without launching the script:
+- for the **non-siamese** architectures (e.g. EfficientNetB4, EfficientNetB4Att), you can simply specify the model in [train_binclass.py](train_binclass.py) with the *--net* parameter;
+- for the **siamese** architectures (e.g. EfficientNetB4ST, EfficientNetB4AttST), you have to:
+  1. train the architecture as a feature extractor first, using the [train_triplet.py](train_triplet.py) script and being careful of specifying its name with the *--net* parameter **without** the ST suffix. For instance, for training the EfficientNetB4ST you will have to first run `python train_triplet.py --net EfficientNetB4 --otherparams`;
+  2. finetune the model using [train_binclass.py](train_binclass.py), being careful this time to specify the architecture's name **with** the ST suffix and to insert as *--init* argument the path to the weights of the feature extractor trained at the previous step. You will end up running something like `python train_binclass.py --net EfficientNetB4ST --init path/to/EfficientNetB4/weights/trained/with/train_triplet/weights.pth --otherparams`
+### Test
+In [test_all.sh](scripts/test_all.sh) you can find a comprehensive list of all the commands for testing the models presented in the paper.
+#### Pretrained weights
+We also provide pretrained weights for all the architectures presented in the paper.
+Please refer to this [Dropbox link](https://www.dropbox.com/sh/cesamx5ytd5j08c/AADG_eEmhskliMaT0Gbk-yHDa?dl=0).
+Each directory is named `$NETWORK_$DATASET` where `$NETWORK` is the architecture name and `$DATASET` is the training dataset.
+In each directory, you can find `bestval.pth` which are the best network weights according to the validation set.
+Additionally, you can find Jupyter notebooks for results computations in the [notebook](notebook) folder.
+## Datasets
+- [Facebook's DeepFake Detection Challenge (DFDC) train dataset](https://www.kaggle.com/c/deepfake-detection-challenge/data) | [arXiv paper](https://arxiv.org/abs/2006.07397)
+- [FaceForensics++](https://github.com/ondyari/FaceForensics/blob/master/dataset/README.md) | [arXiv paper](https://arxiv.org/abs/1901.08971)
+- [Celeb-DF (v2)](http://www.cs.albany.edu/~lsw/celeb-deepfakeforensics.html) | [arXiv paper](https://arxiv.org/abs/1909.12962) (**Just for reference, not used in the paper**)
+## References
+- [EfficientNet PyTorch](https://github.com/lukemelas/EfficientNet-PyTorch)
+- [Xception PyTorch](https://github.com/tstandley/Xception-PyTorch)
+## How to cite
+Plain text:
+```
+N. Bonettini, E. D. Cannas, S. Mandelli, L. Bondi, P. Bestagini and S. Tubaro, "Video Face Manipulation Detection Through Ensemble of CNNs," 2020 25th International Conference on Pattern Recognition (ICPR), 2021, pp. 5012-5019, doi: 10.1109/ICPR48806.2021.9412711.
+```
+Bibtex:
+```bibtex
+@INPROCEEDINGS{9412711,
+  author={Bonettini, Nicolò and Cannas, Edoardo Daniele and Mandelli, Sara and Bondi, Luca and Bestagini, Paolo and Tubaro, Stefano},
+  booktitle={2020 25th International Conference on Pattern Recognition (ICPR)},
+  title={Video Face Manipulation Detection Through Ensemble of CNNs},
+  year={2021},
+  volume={},
+  number={},
+  pages={5012-5019},
+  doi={10.1109/ICPR48806.2021.9412711}}
+```
+## Credits
+[Image and Sound Processing Lab - Politecnico di Milano](http://ispl.deib.polimi.it/)
+- Nicolò Bonettini
+- Edoardo Daniele Cannas
+- Sara Mandelli
+- Luca Bondi
+- Paolo Bestagini

models/icpr2020dfdc/architectures/__init__.py ADDED Viewed

File without changes

models/icpr2020dfdc/architectures/externals/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ from .xception import xception

models/icpr2020dfdc/architectures/externals/xception.py ADDED Viewed

	@@ -0,0 +1,236 @@

+"""
+Ported to pytorch thanks to [tstandley](https://github.com/tstandley/Xception-PyTorch)
+@author: tstandley
+Adapted by cadene
+Creates an Xception Model as defined in:
+Francois Chollet
+Xception: Deep Learning with Depthwise Separable Convolutions
+https://arxiv.org/pdf/1610.02357.pdf
+These weights were ported from the Keras implementation. They achieve the following performance on the validation set:
+Loss:0.9173 Prec@1:78.892 Prec@5:94.292
+REMEMBER to set your image size to 3x299x299 for both test and validation
+normalize = transforms.Normalize(mean=[0.5, 0.5, 0.5],
+                                  std=[0.5, 0.5, 0.5])
+The resize parameter of the validation transform should be 333, and make sure to center crop at 299x299
+"""
+from __future__ import print_function, division, absolute_import
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.model_zoo as model_zoo
+__all__ = ['xception']
+# Registry of pretrained checkpoints, keyed by model name and then by weight-set
+# name. The xception() factory in this module reads 'url', 'num_classes',
+# 'input_space', 'input_size', 'input_range', 'mean' and 'std' from an entry;
+# 'scale' is not read anywhere in this module.
+pretrained_settings = {
+    'xception': {
+        'imagenet': {
+            'url': 'http://data.lip6.fr/cadene/pretrainedmodels/xception-43020ad28.pth',
+            'input_space': 'RGB',
+            'input_size': [3, 299, 299],
+            'input_range': [0, 1],
+            'mean': [0.5, 0.5, 0.5],
+            'std': [0.5, 0.5, 0.5],
+            'num_classes': 1000,
+            'scale': 0.8975
+            # The resize parameter of the validation transform should be 333, and make sure to center crop at 299x299
+        }
+    }
+}
+class SeparableConv2d(nn.Module):
+    """Depthwise convolution followed by a 1x1 (pointwise) convolution.
+    ``conv1`` is grouped with ``groups=in_channels`` and keeps the channel
+    count; ``pointwise`` then maps ``in_channels`` to ``out_channels``.
+    No normalization or activation is applied inside this module.
+    """
+    def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, padding=0, dilation=1, bias=False):
+        super(SeparableConv2d, self).__init__()
+        # Depthwise step: one filter group per input channel.
+        self.conv1 = nn.Conv2d(in_channels, in_channels, kernel_size, stride, padding, dilation, groups=in_channels,
+                               bias=bias)
+        # Pointwise step: kernel 1, stride 1, padding 0, dilation 1, groups 1.
+        self.pointwise = nn.Conv2d(in_channels, out_channels, 1, 1, 0, 1, 1, bias=bias)
+    def forward(self, x):
+        """Apply the depthwise convolution, then the pointwise convolution."""
+        x = self.conv1(x)
+        x = self.pointwise(x)
+        return x
+class Block(nn.Module):
+    """Residual block built from ReLU -> SeparableConv2d -> BatchNorm2d units.
+    Args:
+        in_filters: channel count of the block input.
+        out_filters: channel count of the block output.
+        reps: number of separable-conv units in the main path.
+        strides: when not 1, a ``MaxPool2d(3, strides, 1)`` ends the main path
+            and the skip convolution uses the same stride.
+        start_with_relu: when False, the leading ReLU of the main path is dropped.
+        grow_first: when True the channel-changing unit comes first, otherwise last.
+    """
+    def __init__(self, in_filters, out_filters, reps, strides=1, start_with_relu=True, grow_first=True):
+        super(Block, self).__init__()
+        # A projection (1x1 conv + BN) is only needed on the skip path when the
+        # main path changes the channel count or the spatial stride.
+        if out_filters != in_filters or strides != 1:
+            self.skip = nn.Conv2d(in_filters, out_filters, 1, stride=strides, bias=False)
+            self.skipbn = nn.BatchNorm2d(out_filters)
+        else:
+            self.skip = None
+        rep = []
+        filters = in_filters
+        if grow_first:
+            rep.append(nn.ReLU(inplace=True))
+            rep.append(SeparableConv2d(in_filters, out_filters, 3, stride=1, padding=1, bias=False))
+            rep.append(nn.BatchNorm2d(out_filters))
+            filters = out_filters
+        # The remaining reps - 1 units keep the current width (`filters`).
+        for i in range(reps - 1):
+            rep.append(nn.ReLU(inplace=True))
+            rep.append(SeparableConv2d(filters, filters, 3, stride=1, padding=1, bias=False))
+            rep.append(nn.BatchNorm2d(filters))
+        if not grow_first:
+            rep.append(nn.ReLU(inplace=True))
+            rep.append(SeparableConv2d(in_filters, out_filters, 3, stride=1, padding=1, bias=False))
+            rep.append(nn.BatchNorm2d(out_filters))
+        # rep[0] is a ReLU at this point: either drop it, or swap it for a
+        # non-in-place ReLU -- presumably so that `inp`, which forward() also
+        # feeds to the skip path, is not overwritten (TODO confirm intent).
+        if not start_with_relu:
+            rep = rep[1:]
+        else:
+            rep[0] = nn.ReLU(inplace=False)
+        if strides != 1:
+            rep.append(nn.MaxPool2d(3, strides, 1))
+        self.rep = nn.Sequential(*rep)
+    def forward(self, inp):
+        """Return main-path output plus the (optionally projected) input."""
+        x = self.rep(inp)
+        if self.skip is not None:
+            skip = self.skip(inp)
+            skip = self.skipbn(skip)
+        else:
+            skip = inp
+        # In-place residual addition.
+        x += skip
+        return x
+class Xception(nn.Module):
+    """
+    Xception optimized for the ImageNet dataset, as specified in
+    https://arxiv.org/pdf/1610.02357.pdf
+    NOTE: ``logits()`` (and therefore ``forward()``) reads ``self.last_linear``,
+    which ``__init__`` does not create -- it only creates ``self.fc``. The
+    ``xception()`` factory in this module renames ``fc`` to ``last_linear``;
+    an instance built directly from this class cannot run ``forward()`` until
+    ``last_linear`` is assigned.
+    """
+    def __init__(self, num_classes=1000):
+        """ Constructor
+        Args:
+            num_classes: number of classes
+        """
+        super(Xception, self).__init__()
+        self.num_classes = num_classes
+        # Stem: two plain 3x3 convolutions, the first with stride 2 and no padding.
+        self.conv1 = nn.Conv2d(3, 32, 3, 2, 0, bias=False)
+        self.bn1 = nn.BatchNorm2d(32)
+        self.relu1 = nn.ReLU(inplace=True)
+        self.conv2 = nn.Conv2d(32, 64, 3, bias=False)
+        self.bn2 = nn.BatchNorm2d(64)
+        self.relu2 = nn.ReLU(inplace=True)
+        # do relu here
+        # Blocks 1-3 widen the features (64 -> 728) and each has stride 2.
+        self.block1 = Block(64, 128, 2, 2, start_with_relu=False, grow_first=True)
+        self.block2 = Block(128, 256, 2, 2, start_with_relu=True, grow_first=True)
+        self.block3 = Block(256, 728, 2, 2, start_with_relu=True, grow_first=True)
+        # Blocks 4-11 are eight identical 728-channel, stride-1 blocks.
+        self.block4 = Block(728, 728, 3, 1, start_with_relu=True, grow_first=True)
+        self.block5 = Block(728, 728, 3, 1, start_with_relu=True, grow_first=True)
+        self.block6 = Block(728, 728, 3, 1, start_with_relu=True, grow_first=True)
+        self.block7 = Block(728, 728, 3, 1, start_with_relu=True, grow_first=True)
+        self.block8 = Block(728, 728, 3, 1, start_with_relu=True, grow_first=True)
+        self.block9 = Block(728, 728, 3, 1, start_with_relu=True, grow_first=True)
+        self.block10 = Block(728, 728, 3, 1, start_with_relu=True, grow_first=True)
+        self.block11 = Block(728, 728, 3, 1, start_with_relu=True, grow_first=True)
+        # Block 12 widens to 1024 channels last (grow_first=False) with stride 2.
+        self.block12 = Block(728, 1024, 2, 2, start_with_relu=True, grow_first=False)
+        self.conv3 = SeparableConv2d(1024, 1536, 3, 1, 1)
+        self.bn3 = nn.BatchNorm2d(1536)
+        self.relu3 = nn.ReLU(inplace=True)
+        # do relu here
+        self.conv4 = SeparableConv2d(1536, 2048, 3, 1, 1)
+        self.bn4 = nn.BatchNorm2d(2048)
+        self.fc = nn.Linear(2048, num_classes)
+        # #------- init weights --------
+        # for m in self.modules():
+        #     if isinstance(m, nn.Conv2d):
+        #         n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+        #         m.weight.data.normal_(0, math.sqrt(2. / n))
+        #     elif isinstance(m, nn.BatchNorm2d):
+        #         m.weight.data.fill_(1)
+        #         m.bias.data.zero_()
+        # #-----------------------------
+    def features(self, input):
+        """Run the convolutional trunk and return the output of ``bn4``.
+        No activation is applied after ``bn4`` here; ``logits()`` applies the ReLU.
+        """
+        x = self.conv1(input)
+        x = self.bn1(x)
+        x = self.relu1(x)
+        x = self.conv2(x)
+        x = self.bn2(x)
+        x = self.relu2(x)
+        x = self.block1(x)
+        x = self.block2(x)
+        x = self.block3(x)
+        x = self.block4(x)
+        x = self.block5(x)
+        x = self.block6(x)
+        x = self.block7(x)
+        x = self.block8(x)
+        x = self.block9(x)
+        x = self.block10(x)
+        x = self.block11(x)
+        x = self.block12(x)
+        x = self.conv3(x)
+        x = self.bn3(x)
+        x = self.relu3(x)
+        x = self.conv4(x)
+        x = self.bn4(x)
+        return x
+    def logits(self, features):
+        """ReLU, global average pool to 1x1, flatten, then apply ``self.last_linear``."""
+        x = nn.ReLU(inplace=True)(features)
+        x = F.adaptive_avg_pool2d(x, (1, 1))
+        x = x.view(x.size(0), -1)
+        # `last_linear` is assigned outside __init__ (see the class docstring).
+        x = self.last_linear(x)
+        return x
+    def forward(self, input):
+        """Compute ``logits(features(input))``."""
+        x = self.features(input)
+        x = self.logits(x)
+        return x
+def xception(num_classes=1000, pretrained='imagenet'):
+    model = Xception(num_classes=num_classes)
+    if pretrained:
+        settings = pretrained_settings['xception'][pretrained]
+        assert num_classes == settings['num_classes'], \
+            "num_classes should be {}, but is {}".format(settings['num_classes'], num_classes)
+        model = Xception(num_classes=num_classes)
+        model.load_state_dict(model_zoo.load_url(settings['url']))
+        model.input_space = settings['input_space']
+        model.input_size = settings['input_size']
+        model.input_range = settings['input_range']
+        model.mean = settings['mean']
+        model.std = settings['std']
+    # TODO: ugly
+    model.last_linear = model.fc
+    del model.fc
+    return model

models/icpr2020dfdc/architectures/fornet.py ADDED Viewed

	@@ -0,0 +1,245 @@

+"""
+Video Face Manipulation Detection Through Ensemble of CNNs
+Image and Sound Processing Lab - Politecnico di Milano
+Nicolò Bonettini
+Edoardo Daniele Cannas
+Sara Mandelli
+Luca Bondi
+Paolo Bestagini
+"""
+from collections import OrderedDict
+import torch
+from efficientnet_pytorch import EfficientNet
+from torch import nn as nn
+from torch.nn import functional as F
+from torchvision import transforms
+from . import externals
+"""
+Feature Extractor
+"""
+class FeatureExtractor(nn.Module):
+    """
+    Abstract base class to be extended by networks that support feature extraction.
+    Subclasses must override ``features()``. The class also provides a default
+    input normalizer and a default accessor for the trainable parameters.
+    """
+    def features(self, x: torch.Tensor) -> torch.Tensor:
+        """Return the feature representation of ``x``; must be overridden."""
+        raise NotImplementedError
+    def get_trainable_parameters(self):
+        """Return the parameters to optimize; by default, all module parameters."""
+        return self.parameters()
+    @staticmethod
+    def get_normalizer():
+        """Return the per-channel ``transforms.Normalize`` applied to inputs."""
+        # These mean/std values look like the commonly used ImageNet statistics
+        # -- presumably matching the pretrained backbones; verify before changing.
+        return transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+"""
+EfficientNet
+"""
+class EfficientNetGen(FeatureExtractor):
+    """Pretrained EfficientNet backbone with a single-output linear classifier.
+    Args:
+        model: name passed to ``EfficientNet.from_pretrained`` (e.g. 'efficientnet-b4').
+    """
+    def __init__(self, model: str):
+        super(EfficientNetGen, self).__init__()
+        self.efficientnet = EfficientNet.from_pretrained(model)
+        # The classifier input width follows the backbone's head convolution.
+        self.classifier = nn.Linear(self.efficientnet._conv_head.out_channels, 1)
+        # The backbone's own fully connected layer is replaced by `classifier`.
+        del self.efficientnet._fc
+    def features(self, x: torch.Tensor) -> torch.Tensor:
+        """Return backbone features after average pooling, flattened from dim 1."""
+        x = self.efficientnet.extract_features(x)
+        x = self.efficientnet._avg_pooling(x)
+        x = x.flatten(start_dim=1)
+        return x
+    def forward(self, x):
+        """Return the classifier output for ``x``; no sigmoid is applied here."""
+        x = self.features(x)
+        x = self.efficientnet._dropout(x)
+        x = self.classifier(x)
+        return x
+class EfficientNetB4(EfficientNetGen):
+    """``EfficientNetGen`` fixed to the 'efficientnet-b4' backbone."""
+    def __init__(self):
+        super(EfficientNetB4, self).__init__(model='efficientnet-b4')
+"""
+EfficientNetAutoAtt
+"""
+class EfficientNetAutoAtt(EfficientNet):
+    """EfficientNet whose intermediate features are gated by a learned attention map.
+    ``init_att`` must be called after construction: it creates ``att_block_idx``
+    and ``attconv``, which ``get_attention`` and ``extract_features`` read.
+    """
+    def init_att(self, model: str, width: int):
+        """
+        Initialize attention
+        :param model: EfficientNet variant name; only 'efficientnet-b4' is accepted here
+        :param width: number of 3x3 conv + ReLU layers placed before the 1x1 output
+            conv; 0 means the attention head is the single 1x1 conv
+        :return: None
+        :raises ValueError: if ``model`` is not 'efficientnet-b4'
+        """
+        if model == 'efficientnet-b4':
+            # Attention is computed on the output of the block with this index.
+            # 56 is hard-coded as that output's channel count -- presumably
+            # specific to B4; verify before supporting other variants.
+            self.att_block_idx = 9
+            if width == 0:
+                self.attconv = nn.Conv2d(kernel_size=1, in_channels=56, out_channels=1)
+            else:
+                attconv_layers = []
+                for i in range(width):
+                    attconv_layers.append(
+                        ('conv{:d}'.format(i), nn.Conv2d(kernel_size=3, padding=1, in_channels=56, out_channels=56)))
+                    attconv_layers.append(
+                        ('relu{:d}'.format(i), nn.ReLU(inplace=True)))
+                attconv_layers.append(('conv_out', nn.Conv2d(kernel_size=1, in_channels=56, out_channels=1)))
+                self.attconv = nn.Sequential(OrderedDict(attconv_layers))
+        else:
+            raise ValueError('Model not valid: {}'.format(model))
+    def get_attention(self, x: torch.Tensor) -> torch.Tensor:
+        """Return ``sigmoid(attconv(.))`` of the output of block ``att_block_idx``.
+        Blocks after that index are not run. Returns None if the index is
+        never reached while iterating over the blocks.
+        """
+        # Placeholder
+        att = None
+        # Stem
+        x = self._swish(self._bn0(self._conv_stem(x)))
+        # Blocks
+        for idx, block in enumerate(self._blocks):
+            # Scale the drop-connect rate linearly with the block index.
+            drop_connect_rate = self._global_params.drop_connect_rate
+            if drop_connect_rate:
+                drop_connect_rate *= float(idx) / len(self._blocks)
+            x = block(x, drop_connect_rate=drop_connect_rate)
+            if idx == self.att_block_idx:
+                att = torch.sigmoid(self.attconv(x))
+                break
+        return att
+    def extract_features(self, x: torch.Tensor) -> torch.Tensor:
+        """Run stem, blocks and head, gating one block's output by its attention map.
+        The output of block ``att_block_idx`` is multiplied by
+        ``sigmoid(attconv(.))`` before being passed to the following blocks.
+        """
+        # Stem
+        x = self._swish(self._bn0(self._conv_stem(x)))
+        # Blocks
+        for idx, block in enumerate(self._blocks):
+            # Scale the drop-connect rate linearly with the block index.
+            drop_connect_rate = self._global_params.drop_connect_rate
+            if drop_connect_rate:
+                drop_connect_rate *= float(idx) / len(self._blocks)
+            x = block(x, drop_connect_rate=drop_connect_rate)
+            if idx == self.att_block_idx:
+                att = torch.sigmoid(self.attconv(x))
+                x = x * att
+        # Head
+        x = self._swish(self._bn1(self._conv_head(x)))
+        return x
+class EfficientNetGenAutoAtt(FeatureExtractor):
+    """Pretrained ``EfficientNetAutoAtt`` backbone with a single-output linear classifier.
+    Args:
+        model: name passed to ``EfficientNetAutoAtt.from_pretrained``.
+        width: forwarded to ``init_att`` together with ``model``.
+    """
+    def __init__(self, model: str, width: int):
+        super(EfficientNetGenAutoAtt, self).__init__()
+        self.efficientnet = EfficientNetAutoAtt.from_pretrained(model)
+        # The attention layers are created here, after the pretrained weights are loaded.
+        self.efficientnet.init_att(model, width)
+        self.classifier = nn.Linear(self.efficientnet._conv_head.out_channels, 1)
+        # The backbone's own fully connected layer is replaced by `classifier`.
+        del self.efficientnet._fc
+    def features(self, x: torch.Tensor) -> torch.Tensor:
+        """Return backbone features after average pooling, flattened from dim 1."""
+        x = self.efficientnet.extract_features(x)
+        x = self.efficientnet._avg_pooling(x)
+        x = x.flatten(start_dim=1)
+        return x
+    def forward(self, x):
+        """Return the classifier output for ``x``; no sigmoid is applied here."""
+        x = self.features(x)
+        x = self.efficientnet._dropout(x)
+        x = self.classifier(x)
+        return x
+    def get_attention(self, x: torch.Tensor) -> torch.Tensor:
+        """Return the backbone's attention map for ``x``."""
+        return self.efficientnet.get_attention(x)
+class EfficientNetAutoAttB4(EfficientNetGenAutoAtt):
+    def __init__(self):
+        super(EfficientNetAutoAttB4, self).__init__(model='efficientnet-b4', width=0)
+"""
+Xception
+"""
+class Xception(FeatureExtractor):
+    def __init__(self):
+        super(Xception, self).__init__()
+        self.xception = externals.xception()
+        self.xception.last_linear = nn.Linear(2048, 1)
+    def features(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.xception.features(x)
+        x = nn.ReLU(inplace=True)(x)
+        x = F.adaptive_avg_pool2d(x, (1, 1))
+        x = x.view(x.size(0), -1)
+        return x
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.xception.forward(x)
+"""
+Siamese tuning
+"""
+class SiameseTuning(FeatureExtractor):
+    def __init__(self, feat_ext: FeatureExtractor, num_feat: int, lastonly: bool = True):
+        super(SiameseTuning, self).__init__()
+        self.feat_ext = feat_ext()
+        if not hasattr(self.feat_ext, 'features'):
+            raise NotImplementedError('The provided feature extractor needs to provide a features() method')
+        self.lastonly = lastonly
+        self.classifier = nn.Sequential(
+            nn.BatchNorm1d(num_features=num_feat),
+            nn.Linear(in_features=num_feat, out_features=1),
+        )
+    def features(self, x):
+        x = self.feat_ext.features(x)
+        return x
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        if self.lastonly:
+            with torch.no_grad():
+                x = self.features(x)
+        else:
+            x = self.features(x)
+        x = self.classifier(x)
+        return x
+    def get_trainable_parameters(self):
+        if self.lastonly:
+            return self.classifier.parameters()
+        else:
+            return self.parameters()
+class EfficientNetB4ST(SiameseTuning):
+    def __init__(self):
+        super(EfficientNetB4ST, self).__init__(feat_ext=EfficientNetB4, num_feat=1792, lastonly=True)
+class EfficientNetAutoAttB4ST(SiameseTuning):
+    def __init__(self):
+        super(EfficientNetAutoAttB4ST, self).__init__(feat_ext=EfficientNetAutoAttB4, num_feat=1792, lastonly=True)
+class XceptionST(SiameseTuning):
+    def __init__(self):
+        super(XceptionST, self).__init__(feat_ext=Xception, num_feat=2048, lastonly=True)

models/icpr2020dfdc/architectures/tripletnet.py ADDED Viewed

	@@ -0,0 +1,44 @@

+"""
+Video Face Manipulation Detection Through Ensemble of CNNs
+Image and Sound Processing Lab - Politecnico di Milano
+Nicolò Bonettini
+Edoardo Daniele Cannas
+Sara Mandelli
+Luca Bondi
+Paolo Bestagini
+"""
+from . import fornet
+from .fornet import FeatureExtractor
+class TripletNet(FeatureExtractor):
+    """
+    Template class for triplet net
+    """
+    def __init__(self, feat_ext: FeatureExtractor):
+        super(TripletNet, self).__init__()
+        self.feat_ext = feat_ext()
+        if not hasattr(self.feat_ext, 'features'):
+            raise NotImplementedError('The provided feature extractor needs to provide a features() method')
+    def features(self, x):
+        return self.feat_ext.features(x)
+    def forward(self, x1, x2, x3):
+        x1 = self.features(x1)
+        x2 = self.features(x2)
+        x3 = self.features(x3)
+        return x1, x2, x3
+class EfficientNetB4(TripletNet):
+    def __init__(self):
+        super(EfficientNetB4, self).__init__(feat_ext=fornet.EfficientNetB4)
+class EfficientNetAutoAttB4(TripletNet):
+    def __init__(self):
+        super(EfficientNetAutoAttB4, self).__init__(feat_ext=fornet.EfficientNetAutoAttB4)

models/icpr2020dfdc/architectures/weights.py ADDED Viewed

	@@ -0,0 +1,24 @@

+"""
+Video Face Manipulation Detection Through Ensemble of CNNs
+Image and Sound Processing Lab - Politecnico di Milano
+Nicolò Bonettini
+Edoardo Daniele Cannas
+Sara Mandelli
+Luca Bondi
+Paolo Bestagini
+"""
+# Download URLs of the released checkpoints. Each key has the form
+# '<architecture>_<dataset>' (dataset suffix DFDC or FFPP) and maps to the URL
+# of a '.pth' file.
+# NOTE(review): the hex string at the end of each file name looks like a
+# content hash of the checkpoint - confirm before relying on it.
+weight_url = {
+'EfficientNetAutoAttB4ST_DFDC':'https://f002.backblazeb2.com/file/icpr2020/EfficientNetAutoAttB4ST_DFDC_bestval-4df0ef7d2f380a5955affa78c35d0942ac1cd65229510353b252737775515a33.pth',
+'EfficientNetAutoAttB4ST_FFPP':'https://f002.backblazeb2.com/file/icpr2020/EfficientNetAutoAttB4ST_FFPP_bestval-ddb357503b9b902e1b925c2550415604c4252b9b9ecafeb7369dc58cc16e9edd.pth',
+'EfficientNetAutoAttB4_DFDC':'https://f002.backblazeb2.com/file/icpr2020/EfficientNetAutoAttB4_DFDC_bestval-72ed969b2a395fffe11a0d5bf0a635e7260ba2588c28683630d97ff7153389fc.pth',
+'EfficientNetAutoAttB4_FFPP':'https://f002.backblazeb2.com/file/icpr2020/EfficientNetAutoAttB4_FFPP_bestval-b0c9e9522a7143cf119843e910234be5e30f77dc527b1b427cdffa5ce3bdbc25.pth',
+'EfficientNetB4ST_DFDC':'https://f002.backblazeb2.com/file/icpr2020/EfficientNetB4ST_DFDC_bestval-86f0a0701b18694dfb5e7837bd09fa8e48a5146c193227edccf59f1b038181c6.pth',
+'EfficientNetB4ST_FFPP':'https://f002.backblazeb2.com/file/icpr2020/EfficientNetB4ST_FFPP_bestval-ccd016668071be5bf5fff68e446d055441739ec7113fb1a6eee998f08396ae92.pth',
+'EfficientNetB4_DFDC':'https://f002.backblazeb2.com/file/icpr2020/EfficientNetB4_DFDC_bestval-c9f3663e2116d3356d056a0ce6453e0fc412a8df68ebd0902f07104d9129a09a.pth',
+'EfficientNetB4_FFPP':'https://f002.backblazeb2.com/file/icpr2020/EfficientNetB4_FFPP_bestval-93aaad84946829e793d1a67ed7e0309b535e2f2395acb4f8d16b92c0616ba8d7.pth',
+'Xception_DFDC':'https://f002.backblazeb2.com/file/icpr2020/Xception_DFDC_bestval-e826cdb64d73ef491e6b8ff8fce0e1e1b7fc1d8e2715bc51a56280fff17596f9.pth',
+'Xception_FFPP':'https://f002.backblazeb2.com/file/icpr2020/Xception_FFPP_bestval-bb119e4913cb8f816cd28a03f81f4c603d6351bf8e3f8e3eb99eebc923aecd22.pth',
+}

models/icpr2020dfdc/blazeface/__init__.py ADDED Viewed

	@@ -0,0 +1,3 @@

+from .blazeface import BlazeFace
+from .face_extract import FaceExtractor
+from .read_video import VideoReader

models/icpr2020dfdc/blazeface/anchors.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a10bb2fb93ab54ca426d6c750bfc3aad685028a16dcf231357d03694f261fd95
+size 28800

models/icpr2020dfdc/blazeface/blazeface.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:54ecff653feaaaf1f7d44b6aff28fd2fc50e483a4e847563b6dd261369c43ba4
+size 420224

models/icpr2020dfdc/blazeface/blazeface.py ADDED Viewed

	@@ -0,0 +1,417 @@

+from typing import List
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+class BlazeBlock(nn.Module):
+    def __init__(self, in_channels, out_channels, kernel_size=3, stride=1):
+        super(BlazeBlock, self).__init__()
+        self.stride = stride
+        self.channel_pad = out_channels - in_channels
+        # TFLite uses slightly different padding than PyTorch
+        # on the depthwise conv layer when the stride is 2.
+        if stride == 2:
+            self.max_pool = nn.MaxPool2d(kernel_size=stride, stride=stride)
+            padding = 0
+        else:
+            padding = (kernel_size - 1) // 2
+        self.convs = nn.Sequential(
+            nn.Conv2d(in_channels=in_channels, out_channels=in_channels,
+                      kernel_size=kernel_size, stride=stride, padding=padding,
+                      groups=in_channels, bias=True),
+            nn.Conv2d(in_channels=in_channels, out_channels=out_channels,
+                      kernel_size=1, stride=1, padding=0, bias=True),
+        )
+        self.act = nn.ReLU(inplace=True)
+    def forward(self, x):
+        if self.stride == 2:
+            h = F.pad(x, (0, 2, 0, 2), "constant", 0)
+            x = self.max_pool(x)
+        else:
+            h = x
+        if self.channel_pad > 0:
+            x = F.pad(x, (0, 0, 0, 0, 0, self.channel_pad), "constant", 0)
+        return self.act(self.convs(h) + x)
+class BlazeFace(nn.Module):
+    """The BlazeFace face detection model from MediaPipe.
+    The version from MediaPipe is simpler than the one in the paper;
+    it does not use the "double" BlazeBlocks.
+    Because we won't be training this model, it doesn't need to have
+    batchnorm layers. These have already been "folded" into the conv
+    weights by TFLite.
+    The conversion to PyTorch is fairly straightforward, but there are
+    some small differences between TFLite and PyTorch in how they handle
+    padding on conv layers with stride 2.
+    This version works on batches, while the MediaPipe version can only
+    handle a single image at a time.
+    Based on code from https://github.com/tkat0/PyTorch_BlazeFace/ and
+    https://github.com/google/mediapipe/
+    """
+    input_size = (128, 128)
+    detection_keys = [
+        'ymin', 'xmin', 'ymax', 'xmax',
+        'kp1x', 'kp1y', 'kp2x', 'kp2y', 'kp3x', 'kp3y', 'kp4x', 'kp4y', 'kp5x', 'kp5y', 'kp6x', 'kp6y',
+        'conf'
+    ]
+    def __init__(self):
+        super(BlazeFace, self).__init__()
+        # These are the settings from the MediaPipe example graph
+        # mediapipe/graphs/face_detection/face_detection_mobile_gpu.pbtxt
+        self.num_classes = 1
+        self.num_anchors = 896
+        self.num_coords = 16
+        self.score_clipping_thresh = 100.0
+        self.x_scale = 128.0
+        self.y_scale = 128.0
+        self.h_scale = 128.0
+        self.w_scale = 128.0
+        self.min_score_thresh = 0.75
+        self.min_suppression_threshold = 0.3
+        self._define_layers()
+    def _define_layers(self):
+        self.backbone1 = nn.Sequential(
+            nn.Conv2d(in_channels=3, out_channels=24, kernel_size=5, stride=2, padding=0, bias=True),
+            nn.ReLU(inplace=True),
+            BlazeBlock(24, 24),
+            BlazeBlock(24, 28),
+            BlazeBlock(28, 32, stride=2),
+            BlazeBlock(32, 36),
+            BlazeBlock(36, 42),
+            BlazeBlock(42, 48, stride=2),
+            BlazeBlock(48, 56),
+            BlazeBlock(56, 64),
+            BlazeBlock(64, 72),
+            BlazeBlock(72, 80),
+            BlazeBlock(80, 88),
+        )
+        self.backbone2 = nn.Sequential(
+            BlazeBlock(88, 96, stride=2),
+            BlazeBlock(96, 96),
+            BlazeBlock(96, 96),
+            BlazeBlock(96, 96),
+            BlazeBlock(96, 96),
+        )
+        self.classifier_8 = nn.Conv2d(88, 2, 1, bias=True)
+        self.classifier_16 = nn.Conv2d(96, 6, 1, bias=True)
+        self.regressor_8 = nn.Conv2d(88, 32, 1, bias=True)
+        self.regressor_16 = nn.Conv2d(96, 96, 1, bias=True)
+    def forward(self, x):
+        # TFLite uses slightly different padding on the first conv layer
+        # than PyTorch, so do it manually.
+        x = F.pad(x, (1, 2, 1, 2), "constant", 0)
+        b = x.shape[0]  # batch size, needed for reshaping later
+        x = self.backbone1(x)  # (b, 88, 16, 16)
+        h = self.backbone2(x)  # (b, 96, 8, 8)
+        # Note: Because PyTorch is NCHW but TFLite is NHWC, we need to
+        # permute the output from the conv layers before reshaping it.
+        c1 = self.classifier_8(x)  # (b, 2, 16, 16)
+        c1 = c1.permute(0, 2, 3, 1)  # (b, 16, 16, 2)
+        c1 = c1.reshape(b, -1, 1)  # (b, 512, 1)
+        c2 = self.classifier_16(h)  # (b, 6, 8, 8)
+        c2 = c2.permute(0, 2, 3, 1)  # (b, 8, 8, 6)
+        c2 = c2.reshape(b, -1, 1)  # (b, 384, 1)
+        c = torch.cat((c1, c2), dim=1)  # (b, 896, 1)
+        r1 = self.regressor_8(x)  # (b, 32, 16, 16)
+        r1 = r1.permute(0, 2, 3, 1)  # (b, 16, 16, 32)
+        r1 = r1.reshape(b, -1, 16)  # (b, 512, 16)
+        r2 = self.regressor_16(h)  # (b, 96, 8, 8)
+        r2 = r2.permute(0, 2, 3, 1)  # (b, 8, 8, 96)
+        r2 = r2.reshape(b, -1, 16)  # (b, 384, 16)
+        r = torch.cat((r1, r2), dim=1)  # (b, 896, 16)
+        return [r, c]
+    def _device(self):
+        """Which device (CPU or GPU) is being used by this model?"""
+        return self.classifier_8.weight.device
+    def load_weights(self, path):
+        self.load_state_dict(torch.load(path))
+        self.eval()
+    def load_anchors(self, path):
+        self.anchors = torch.tensor(np.load(path), dtype=torch.float32, device=self._device())
+        assert (self.anchors.ndimension() == 2)
+        assert (self.anchors.shape[0] == self.num_anchors)
+        assert (self.anchors.shape[1] == 4)
+    def _preprocess(self, x):
+        """Converts the image pixels to the range [-1, 1]."""
+        return x.float() / 127.5 - 1.0
+    def predict_on_image(self, img):
+        """Makes a prediction on a single image.
+        Arguments:
+            img: a NumPy array of shape (H, W, 3) or a PyTorch tensor of
+                 shape (3, H, W). The image's height and width should be
+                 128 pixels.
+        Returns:
+            A tensor with face detections.
+        """
+        if isinstance(img, np.ndarray):
+            img = torch.from_numpy(img).permute((2, 0, 1))
+        return self.predict_on_batch(img.unsqueeze(0))[0]
+    def predict_on_batch(self, x: np.ndarray or torch.Tensor, apply_nms: bool = True) -> List[torch.Tensor]:
+        """Makes a prediction on a batch of images.
+        Arguments:
+            x: a NumPy array of shape (b, H, W, 3) or a PyTorch tensor of
+               shape (b, 3, H, W). The height and width should be 128 pixels.
+            apply_nms: pass False to not apply non-max suppression
+        Returns:
+            A list containing a tensor of face detections for each image in
+            the batch. If no faces are found for an image, returns a tensor
+            of shape (0, 17).
+        Each face detection is a PyTorch tensor consisting of 17 numbers:
+            - ymin, xmin, ymax, xmax
+            - x,y-coordinates for the 6 keypoints
+            - confidence score
+        """
+        if isinstance(x, np.ndarray):
+            x = torch.from_numpy(x).permute((0, 3, 1, 2))
+        assert x.shape[1] == 3
+        assert x.shape[2] == 128
+        assert x.shape[3] == 128
+        # 1. Preprocess the images into tensors:
+        x = x.to(self._device())
+        x = self._preprocess(x)
+        # 2. Run the neural network:
+        with torch.no_grad():
+            out: torch.Tensor = self.__call__(x)
+        # 3. Postprocess the raw predictions:
+        detections = self._tensors_to_detections(out[0], out[1], self.anchors)
+        # 4. Non-maximum suppression to remove overlapping detections:
+        return self.nms(detections) if apply_nms else detections
+    def nms(self, detections: List[torch.Tensor]) -> List[torch.Tensor]:
+        """Filters out overlapping detections."""
+        filtered_detections = []
+        for i in range(len(detections)):
+            faces = self._weighted_non_max_suppression(detections[i])
+            faces = torch.stack(faces) if len(faces) > 0 else torch.zeros((0, 17), device=self._device())
+            filtered_detections.append(faces)
+        return filtered_detections
+    def _tensors_to_detections(self, raw_box_tensor: torch.Tensor, raw_score_tensor: torch.Tensor, anchors) -> List[
+        torch.Tensor]:
+        """The output of the neural network is a tensor of shape (b, 896, 16)
+        containing the bounding box regressor predictions, as well as a tensor
+        of shape (b, 896, 1) with the classification confidences.
+        This function converts these two "raw" tensors into proper detections.
+        Returns a list of (num_detections, 17) tensors, one for each image in
+        the batch.
+        This is based on the source code from:
+        mediapipe/calculators/tflite/tflite_tensors_to_detections_calculator.cc
+        mediapipe/calculators/tflite/tflite_tensors_to_detections_calculator.proto
+        """
+        assert raw_box_tensor.ndimension() == 3
+        assert raw_box_tensor.shape[1] == self.num_anchors
+        assert raw_box_tensor.shape[2] == self.num_coords
+        assert raw_score_tensor.ndimension() == 3
+        assert raw_score_tensor.shape[1] == self.num_anchors
+        assert raw_score_tensor.shape[2] == self.num_classes
+        assert raw_box_tensor.shape[0] == raw_score_tensor.shape[0]
+        detection_boxes = self._decode_boxes(raw_box_tensor, anchors)
+        thresh = self.score_clipping_thresh
+        raw_score_tensor = raw_score_tensor.clamp(-thresh, thresh)
+        detection_scores = raw_score_tensor.sigmoid().squeeze(dim=-1)
+        # Note: we stripped off the last dimension from the scores tensor
+        # because there is only has one class. Now we can simply use a mask
+        # to filter out the boxes with too low confidence.
+        mask = detection_scores >= self.min_score_thresh
+        # Because each image from the batch can have a different number of
+        # detections, process them one at a time using a loop.
+        output_detections = []
+        for i in range(raw_box_tensor.shape[0]):
+            boxes = detection_boxes[i, mask[i]]
+            scores = detection_scores[i, mask[i]].unsqueeze(dim=-1)
+            output_detections.append(torch.cat((boxes, scores), dim=-1))
+        return output_detections
+    def _decode_boxes(self, raw_boxes, anchors):
+        """Converts the predictions into actual coordinates using
+        the anchor boxes. Processes the entire batch at once.
+        """
+        boxes = torch.zeros_like(raw_boxes)
+        x_center = raw_boxes[..., 0] / self.x_scale * anchors[:, 2] + anchors[:, 0]
+        y_center = raw_boxes[..., 1] / self.y_scale * anchors[:, 3] + anchors[:, 1]
+        w = raw_boxes[..., 2] / self.w_scale * anchors[:, 2]
+        h = raw_boxes[..., 3] / self.h_scale * anchors[:, 3]
+        boxes[..., 0] = y_center - h / 2.  # ymin
+        boxes[..., 1] = x_center - w / 2.  # xmin
+        boxes[..., 2] = y_center + h / 2.  # ymax
+        boxes[..., 3] = x_center + w / 2.  # xmax
+        for k in range(6):
+            offset = 4 + k * 2
+            keypoint_x = raw_boxes[..., offset] / self.x_scale * anchors[:, 2] + anchors[:, 0]
+            keypoint_y = raw_boxes[..., offset + 1] / self.y_scale * anchors[:, 3] + anchors[:, 1]
+            boxes[..., offset] = keypoint_x
+            boxes[..., offset + 1] = keypoint_y
+        return boxes
+    def _weighted_non_max_suppression(self, detections):
+        """The alternative NMS method as mentioned in the BlazeFace paper:
+        "We replace the suppression algorithm with a blending strategy that
+        estimates the regression parameters of a bounding box as a weighted
+        mean between the overlapping predictions."
+        The original MediaPipe code assigns the score of the most confident
+        detection to the weighted detection, but we take the average score
+        of the overlapping detections.
+        The input detections should be a Tensor of shape (count, 17).
+        Returns a list of PyTorch tensors, one for each detected face.
+        This is based on the source code from:
+        mediapipe/calculators/util/non_max_suppression_calculator.cc
+        mediapipe/calculators/util/non_max_suppression_calculator.proto
+        """
+        if len(detections) == 0: return []
+        output_detections = []
+        # Sort the detections from highest to lowest score.
+        remaining = torch.argsort(detections[:, 16], descending=True)
+        while len(remaining) > 0:
+            detection = detections[remaining[0]]
+            # Compute the overlap between the first box and the other
+            # remaining boxes. (Note that the other_boxes also include
+            # the first_box.)
+            first_box = detection[:4]
+            other_boxes = detections[remaining, :4]
+            ious = overlap_similarity(first_box, other_boxes)
+            # If two detections don't overlap enough, they are considered
+            # to be from different faces.
+            mask = ious > self.min_suppression_threshold
+            overlapping = remaining[mask]
+            remaining = remaining[~mask]
+            # Take an average of the coordinates from the overlapping
+            # detections, weighted by their confidence scores.
+            weighted_detection = detection.clone()
+            if len(overlapping) > 1:
+                coordinates = detections[overlapping, :16]
+                scores = detections[overlapping, 16:17]
+                total_score = scores.sum()
+                weighted = (coordinates * scores).sum(dim=0) / total_score
+                weighted_detection[:16] = weighted
+                weighted_detection[16] = total_score / len(overlapping)
+            output_detections.append(weighted_detection)
+        return output_detections
+    # IOU code from https://github.com/amdegroot/ssd.pytorch/blob/master/layers/box_utils.py
+def intersect(box_a, box_b):
+    """ We resize both tensors to [A,B,2] without new malloc:
+    [A,2] -> [A,1,2] -> [A,B,2]
+    [B,2] -> [1,B,2] -> [A,B,2]
+    Then we compute the area of intersect between box_a and box_b.
+    Args:
+      box_a: (tensor) bounding boxes, Shape: [A,4].
+      box_b: (tensor) bounding boxes, Shape: [B,4].
+    Return:
+      (tensor) intersection area, Shape: [A,B].
+    """
+    A = box_a.size(0)
+    B = box_b.size(0)
+    max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2),
+                       box_b[:, 2:].unsqueeze(0).expand(A, B, 2))
+    min_xy = torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2),
+                       box_b[:, :2].unsqueeze(0).expand(A, B, 2))
+    inter = torch.clamp((max_xy - min_xy), min=0)
+    return inter[:, :, 0] * inter[:, :, 1]
+def jaccard(box_a, box_b):
+    """Compute the jaccard overlap of two sets of boxes.  The jaccard overlap
+    is simply the intersection over union of two boxes.  Here we operate on
+    ground truth boxes and default boxes.
+    E.g.:
+        A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B)
+    Args:
+        box_a: (tensor) Ground truth bounding boxes, Shape: [num_objects,4]
+        box_b: (tensor) Prior boxes from priorbox layers, Shape: [num_priors,4]
+    Return:
+        jaccard overlap: (tensor) Shape: [box_a.size(0), box_b.size(0)]
+    """
+    inter = intersect(box_a, box_b)
+    area_a = ((box_a[:, 2] - box_a[:, 0]) *
+              (box_a[:, 3] - box_a[:, 1])).unsqueeze(1).expand_as(inter)  # [A,B]
+    area_b = ((box_b[:, 2] - box_b[:, 0]) *
+              (box_b[:, 3] - box_b[:, 1])).unsqueeze(0).expand_as(inter)  # [A,B]
+    union = area_a + area_b - inter
+    return inter / union  # [A,B]
+def overlap_similarity(box, other_boxes):
+    """Computes the IOU between a bounding box and set of other boxes."""
+    return jaccard(box.unsqueeze(0), other_boxes).squeeze(0)

models/icpr2020dfdc/blazeface/face_extract.py ADDED Viewed

	@@ -0,0 +1,470 @@

+import os
+from typing import Tuple, List
+import cv2
+import numpy as np
+import torch
+from PIL import Image
+from blazeface import BlazeFace
+class FaceExtractor:
+    """Wrapper for face extraction workflow."""
+    def __init__(self, video_read_fn = None, facedet: BlazeFace = None):
+        """Creates a new FaceExtractor.
+        Arguments:
+            video_read_fn: a function that takes in a path to a video file
+                and returns a tuple consisting of a NumPy array with shape
+                (num_frames, H, W, 3) and a list of frame indices, or None
+                in case of an error
+            facedet: the face detector object
+        """
+        self.video_read_fn = video_read_fn
+        self.facedet = facedet
+    def process_image(self, path: str = None, img: Image.Image or np.ndarray = None) -> dict:
+        """
+        Process a single image
+        :param path: Path to the image
+        :param img: image
+        :return:
+        """
+        if img is not None and path is not None:
+            raise ValueError('Only one argument between path and img can be specified')
+        if img is None and path is None:
+            raise ValueError('At least one argument between path and img must be specified')
+        target_size = self.facedet.input_size
+        if img is None:
+            img = np.asarray(Image.open(str(path)))
+        else:
+            img = np.asarray(img)
+        # Split the frames into several tiles. Resize the tiles to 128x128.
+        tiles, resize_info = self._tile_frames(np.expand_dims(img, 0), target_size)
+        # tiles has shape (num_tiles, target_size, target_size, 3)
+        # resize_info is a list of four elements [resize_factor_y, resize_factor_x, 0, 0]
+        # Run the face detector. The result is a list of PyTorch tensors,
+        # one for each tile in the batch.
+        detections = self.facedet.predict_on_batch(tiles, apply_nms=False)
+        # Convert the detections from 128x128 back to the original frame size.
+        detections = self._resize_detections(detections, target_size, resize_info)
+        # Because we have several tiles for each frame, combine the predictions
+        # from these tiles. The result is a list of PyTorch tensors, but now one
+        # for each frame (rather than each tile).
+        num_frames = 1
+        frame_size = (img.shape[1], img.shape[0])
+        detections = self._untile_detections(num_frames, frame_size, detections)
+        # The same face may have been detected in multiple tiles, so filter out
+        # overlapping detections. This is done separately for each frame.
+        detections = self.facedet.nms(detections)
+        # Crop the faces out of the original frame.
+        frameref_detections = self._add_margin_to_detections(detections[0], frame_size, 0.2)
+        faces = self._crop_faces(img, frameref_detections)
+        kpts = self._crop_kpts(img, detections[0], 0.3)
+        # Add additional information about the frame and detections.
+        scores = list(detections[0][:, 16].cpu().numpy())
+        frame_dict = {"frame_w": frame_size[0],
+                      "frame_h": frame_size[1],
+                      "faces": faces,
+                      "kpts": kpts,
+                      "detections": frameref_detections.cpu().numpy(),
+                      "scores": scores,
+                      }
+        # Sort faces by descending confidence
+        frame_dict = self._soft_faces_by_descending_score(frame_dict)
+        return frame_dict
+    def _soft_faces_by_descending_score(self, frame_dict: dict) -> dict:
+        if len(frame_dict['scores']) > 1:
+            sort_idxs = np.argsort(frame_dict['scores'])[::-1]
+            new_faces = [frame_dict['faces'][i] for i in sort_idxs]
+            new_kpts = [frame_dict['kpts'][i] for i in sort_idxs]
+            new_detections = frame_dict['detections'][sort_idxs]
+            new_scores = [frame_dict['scores'][i] for i in sort_idxs]
+            frame_dict['faces'] = new_faces
+            frame_dict['kpts'] = new_kpts
+            frame_dict['detections'] = new_detections
+            frame_dict['scores'] = new_scores
+        return frame_dict
+    def process_videos(self, input_dir, filenames, video_idxs) -> List[dict]:
+        """For the specified selection of videos, grabs one or more frames
+        from each video, runs the face detector, and tries to find the faces
+        in each frame.
+        The frames are split into tiles, and the tiles from the different videos
+        are concatenated into a single batch. This means the face detector gets
+        a batch of size len(video_idxs) * num_frames * num_tiles (usually 3).
+        Arguments:
+            input_dir: base folder where the video files are stored
+            filenames: list of all video files in the input_dir
+            video_idxs: one or more indices from the filenames list; these
+                are the videos we'll actually process
+        Returns a list of dictionaries, one for each frame read from each video.
+        This dictionary contains:
+            - video_idx: the video this frame was taken from
+            - frame_idx: the index of the frame in the video
+            - frame_w, frame_h: original dimensions of the frame
+            - faces: a list containing zero or more NumPy arrays with a face crop
+            - scores: a list array with the confidence score for each face crop
+        If reading a video failed for some reason, it will not appear in the
+        output array. Note that there's no guarantee a given video will actually
+        have num_frames results (as soon as a reading problem is encountered for
+        a video, we continue with the next video).
+        """
+        target_size = self.facedet.input_size
+        videos_read = []
+        frames_read = []
+        frames = []
+        tiles = []
+        resize_info = []
+        for video_idx in video_idxs:
+            # Read the full-size frames from this video.
+            filename = filenames[video_idx]
+            video_path = os.path.join(input_dir, filename)
+            result = self.video_read_fn(video_path)
+            # Error? Then skip this video.
+            if result is None: continue
+            videos_read.append(video_idx)
+            # Keep track of the original frames (need them later).
+            my_frames, my_idxs = result
+            frames.append(my_frames)
+            frames_read.append(my_idxs)
+            # Split the frames into several tiles. Resize the tiles to 128x128.
+            my_tiles, my_resize_info = self._tile_frames(my_frames, target_size)
+            tiles.append(my_tiles)
+            resize_info.append(my_resize_info)
+        if len(tiles) == 0:
+            return []
+        # Put all the tiles for all the frames from all the videos into
+        # a single batch.
+        batch = np.concatenate(tiles)
+        # Run the face detector. The result is a list of PyTorch tensors,
+        # one for each image in the batch.
+        all_detections = self.facedet.predict_on_batch(batch, apply_nms=False)
+        result = []
+        offs = 0
+        for v in range(len(tiles)):
+            # Not all videos may have the same number of tiles, so find which
+            # detections go with which video.
+            num_tiles = tiles[v].shape[0]
+            detections = all_detections[offs:offs + num_tiles]
+            offs += num_tiles
+            # Convert the detections from 128x128 back to the original frame size.
+            detections = self._resize_detections(detections, target_size, resize_info[v])
+            # Because we have several tiles for each frame, combine the predictions
+            # from these tiles. The result is a list of PyTorch tensors, but now one
+            # for each frame (rather than each tile).
+            num_frames = frames[v].shape[0]
+            frame_size = (frames[v].shape[2], frames[v].shape[1])
+            detections = self._untile_detections(num_frames, frame_size, detections)
+            # The same face may have been detected in multiple tiles, so filter out
+            # overlapping detections. This is done separately for each frame.
+            detections = self.facedet.nms(detections)
+            for i in range(len(detections)):
+                # Crop the faces out of the original frame.
+                frameref_detections = self._add_margin_to_detections(detections[i], frame_size, 0.2)
+                faces = self._crop_faces(frames[v][i], frameref_detections)
+                kpts = self._crop_kpts(frames[v][i], detections[i], 0.3)
+                # Add additional information about the frame and detections.
+                scores = list(detections[i][:, 16].cpu().numpy())
+                frame_dict = {"video_idx": videos_read[v],
+                              "frame_idx": frames_read[v][i],
+                              "frame_w": frame_size[0],
+                              "frame_h": frame_size[1],
+                              "frame": frames[v][i],
+                              "faces": faces,
+                              "kpts": kpts,
+                              "detections": frameref_detections.cpu().numpy(),
+                              "scores": scores,
+                              }
+                # Sort faces by descending confidence
+                frame_dict = self._soft_faces_by_descending_score(frame_dict)
+                result.append(frame_dict)
+        return result
+    def process_video(self, video_path):
+        """Convenience method for doing face extraction on a single video."""
+        input_dir = os.path.dirname(video_path)
+        filenames = [os.path.basename(video_path)]
+        return self.process_videos(input_dir, filenames, [0])
+    def _tile_frames(self, frames: np.ndarray, target_size: Tuple[int, int]) -> (np.ndarray, List[float]):
+        """Splits each frame into several smaller, partially overlapping tiles
+        and resizes each tile to target_size.
+        After a bunch of experimentation, I found that for a 1920x1080 video,
+        BlazeFace works better on three 1080x1080 windows. These overlap by 420
+        pixels. (Two windows also work but it's best to have a clean center crop
+        in there as well.)
+        I also tried 6 windows of size 720x720 (horizontally: 720|360, 360|720;
+        vertically: 720|1200, 480|720|480, 1200|720) but that gives many false
+        positives when a window has no face in it.
+        For a video in portrait orientation (1080x1920), we only take a single
+        crop of the top-most 1080 pixels. If we split up the video vertically,
+        then we might get false positives again.
+        (NOTE: Not all videos are necessarily 1080p but the code can handle this.)
+        Arguments:
+            frames: NumPy array of shape (num_frames, height, width, 3)
+            target_size: (width, height)
+        Returns:
+            - a new (num_frames * N, target_size[1], target_size[0], 3) array
+              where N is the number of tiles used.
+            - a list [scale_w, scale_h, offset_x, offset_y] that describes how
+              to map the resized and cropped tiles back to the original image
+              coordinates. This is needed for scaling up the face detections
+              from the smaller image to the original image, so we can take the
+              face crops in the original coordinate space.
+        """
+        num_frames, H, W, _ = frames.shape
+        num_h, num_v, split_size, x_step, y_step = self.get_tiles_params(H, W)
+        splits = np.zeros((num_frames * num_v * num_h, target_size[1], target_size[0], 3), dtype=np.uint8)
+        i = 0
+        for f in range(num_frames):
+            y = 0
+            for v in range(num_v):
+                x = 0
+                for h in range(num_h):
+                    crop = frames[f, y:y + split_size, x:x + split_size, :]
+                    splits[i] = cv2.resize(crop, target_size, interpolation=cv2.INTER_AREA)
+                    x += x_step
+                    i += 1
+                y += y_step
+        resize_info = [split_size / target_size[0], split_size / target_size[1], 0, 0]
+        return splits, resize_info
+    def get_tiles_params(self, H, W):
+        split_size = min(H, W, 720)
+        x_step = (W - split_size) // 2
+        y_step = (H - split_size) // 2
+        num_v = (H - split_size) // y_step + 1 if y_step > 0 else 1
+        num_h = (W - split_size) // x_step + 1 if x_step > 0 else 1
+        return num_h, num_v, split_size, x_step, y_step
+    def _resize_detections(self, detections, target_size, resize_info):
+        """Converts a list of face detections back to the original
+        coordinate system.
+        Arguments:
+            detections: a list containing PyTorch tensors of shape (num_faces, 17)
+            target_size: (width, height)
+            resize_info: [scale_w, scale_h, offset_x, offset_y]
+        """
+        projected = []
+        target_w, target_h = target_size
+        scale_w, scale_h, offset_x, offset_y = resize_info
+        for i in range(len(detections)):
+            detection = detections[i].clone()
+            # ymin, xmin, ymax, xmax
+            for k in range(2):
+                detection[:, k * 2] = (detection[:, k * 2] * target_h - offset_y) * scale_h
+                detection[:, k * 2 + 1] = (detection[:, k * 2 + 1] * target_w - offset_x) * scale_w
+            # keypoints are x,y
+            for k in range(2, 8):
+                detection[:, k * 2] = (detection[:, k * 2] * target_w - offset_x) * scale_w
+                detection[:, k * 2 + 1] = (detection[:, k * 2 + 1] * target_h - offset_y) * scale_h
+            projected.append(detection)
+        return projected
+    def _untile_detections(self, num_frames: int, frame_size: Tuple[int, int], detections: List[torch.Tensor]) -> List[
+        torch.Tensor]:
+        """With N tiles per frame, there also are N times as many detections.
+        This function groups together the detections for a given frame; it is
+        the complement to tile_frames().
+        """
+        combined_detections = []
+        W, H = frame_size
+        num_h, num_v, split_size, x_step, y_step = self.get_tiles_params(H, W)
+        i = 0
+        for f in range(num_frames):
+            detections_for_frame = []
+            y = 0
+            for v in range(num_v):
+                x = 0
+                for h in range(num_h):
+                    # Adjust the coordinates based on the split positions.
+                    detection = detections[i].clone()
+                    if detection.shape[0] > 0:
+                        for k in range(2):
+                            detection[:, k * 2] += y
+                            detection[:, k * 2 + 1] += x
+                        for k in range(2, 8):
+                            detection[:, k * 2] += x
+                            detection[:, k * 2 + 1] += y
+                    detections_for_frame.append(detection)
+                    x += x_step
+                    i += 1
+                y += y_step
+            combined_detections.append(torch.cat(detections_for_frame))
+        return combined_detections
+    def _add_margin_to_detections(self, detections: torch.Tensor, frame_size: Tuple[int, int],
+                                  margin: float = 0.2) -> torch.Tensor:
+        """Expands the face bounding box.
+        NOTE: The face detections often do not include the forehead, which
+        is why we use twice the margin for ymin.
+        Arguments:
+            detections: a PyTorch tensor of shape (num_detections, 17)
+            frame_size: maximum (width, height)
+            margin: a percentage of the bounding box's height
+        Returns a PyTorch tensor of shape (num_detections, 17).
+        """
+        offset = torch.round(margin * (detections[:, 2] - detections[:, 0]))
+        detections = detections.clone()
+        detections[:, 0] = torch.clamp(detections[:, 0] - offset * 2, min=0)  # ymin
+        detections[:, 1] = torch.clamp(detections[:, 1] - offset, min=0)  # xmin
+        detections[:, 2] = torch.clamp(detections[:, 2] + offset, max=frame_size[1])  # ymax
+        detections[:, 3] = torch.clamp(detections[:, 3] + offset, max=frame_size[0])  # xmax
+        return detections
+    def _crop_faces(self, frame: np.ndarray, detections: torch.Tensor) -> List[np.ndarray]:
+        """Copies the face region(s) from the given frame into a set
+        of new NumPy arrays.
+        Arguments:
+            frame: a NumPy array of shape (H, W, 3)
+            detections: a PyTorch tensor of shape (num_detections, 17)
+        Returns a list of NumPy arrays, one for each face crop. If there
+        are no faces detected for this frame, returns an empty list.
+        """
+        faces = []
+        for i in range(len(detections)):
+            ymin, xmin, ymax, xmax = detections[i, :4].cpu().numpy().astype(int)
+            face = frame[ymin:ymax, xmin:xmax, :]
+            faces.append(face)
+        return faces
+    def _crop_kpts(self, frame: np.ndarray, detections: torch.Tensor, face_fraction: float):
+        """Copies the parts region(s) from the given frame into a set
+        of new NumPy arrays.
+        Arguments:
+            frame: a NumPy array of shape (H, W, 3)
+            detections: a PyTorch tensor of shape (num_detections, 17)
+            face_fraction: float between 0 and 1 indicating how big are the parts to be extracted w.r.t the whole face
+        Returns a list of NumPy arrays, one for each face crop. If there
+        are no faces detected for this frame, returns an empty list.
+        """
+        faces = []
+        for i in range(len(detections)):
+            kpts = []
+            size = int(face_fraction * min(detections[i, 2] - detections[i, 0], detections[i, 3] - detections[i, 1]))
+            kpts_coords = detections[i, 4:16].cpu().numpy().astype(int)
+            for kpidx in range(6):
+                kpx, kpy = kpts_coords[kpidx * 2:kpidx * 2 + 2]
+                kpt = frame[kpy - size // 2:kpy - size // 2 + size, kpx - size // 2:kpx - size // 2 + size, ]
+                kpts.append(kpt)
+            faces.append(kpts)
+        return faces
+    def remove_large_crops(self, crops, pct=0.1):
+        """Removes faces from the results if they take up more than X%
+        of the video. Such a face is likely a false positive.
+        This is an optional postprocessing step. Modifies the original
+        data structure.
+        Arguments:
+            crops: a list of dictionaries with face crop data
+            pct: maximum portion of the frame a crop may take up
+        """
+        for i in range(len(crops)):
+            frame_data = crops[i]
+            video_area = frame_data["frame_w"] * frame_data["frame_h"]
+            faces = frame_data["faces"]
+            scores = frame_data["scores"]
+            new_faces = []
+            new_scores = []
+            for j in range(len(faces)):
+                face = faces[j]
+                face_H, face_W, _ = face.shape
+                face_area = face_H * face_W
+                if face_area / video_area < 0.1:
+                    new_faces.append(face)
+                    new_scores.append(scores[j])
+            frame_data["faces"] = new_faces
+            frame_data["scores"] = new_scores
+    def keep_only_best_face(self, crops):
+        """For each frame, only keeps the face with the highest confidence.
+        This gets rid of false positives, but obviously is problematic for
+        videos with two people!
+        This is an optional postprocessing step. Modifies the original
+        data structure.
+        """
+        for i in range(len(crops)):
+            frame_data = crops[i]
+            if len(frame_data["faces"]) > 0:
+                frame_data["faces"] = frame_data["faces"][:1]
+                frame_data["scores"] = frame_data["scores"][:1]
+    # TODO: def filter_likely_false_positives(self, crops):
+    #   if only some frames have more than 1 face, it's likely a false positive
+    #   if most frames have more than 1 face, it's probably two people
+    #   so find the % of frames with > 1 face; if > 0.X, keep the two best faces
+    # TODO: def filter_by_score(self, crops, min_score) to remove any
+    # crops with a confidence score lower than min_score
+    # TODO: def sort_by_histogram(self, crops) for videos with 2 people.

models/icpr2020dfdc/blazeface/read_video.py ADDED Viewed

	@@ -0,0 +1,213 @@

+import cv2
+import numpy as np
+class VideoReader:
+    """Helper class for reading one or more frames from a video file."""
+    def __init__(self, verbose=True, insets=(0, 0)):
+        """Creates a new VideoReader.
+        Arguments:
+            verbose: whether to print warnings and error messages
+            insets: amount to inset the image by, as a percentage of
+                (width, height). This lets you "zoom in" to an image
+                to remove unimportant content around the borders.
+                Useful for face detection, which may not work if the
+                faces are too small.
+        """
+        self.verbose = verbose
+        self.insets = insets
+    def read_frames(self, path, num_frames, jitter=0, seed=None):
+        """Reads frames that are always evenly spaced throughout the video.
+        Arguments:
+            path: the video file
+            num_frames: how many frames to read, -1 means the entire video
+                (warning: this will take up a lot of memory!)
+            jitter: if not 0, adds small random offsets to the frame indices;
+                this is useful so we don't always land on even or odd frames
+            seed: random seed for jittering; if you set this to a fixed value,
+                you probably want to set it only on the first video
+        """
+        assert num_frames > 0
+        capture = cv2.VideoCapture(path)
+        frame_count = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))
+        if frame_count <= 0: return None
+        frame_idxs = np.linspace(0, frame_count - 1, num_frames, endpoint=True, dtype=int)
+        frame_idxs = np.unique(frame_idxs)  # Avoid repeating frame idxs otherwise it breaks reading
+        if jitter > 0:
+            np.random.seed(seed)
+            jitter_offsets = np.random.randint(-jitter, jitter, len(frame_idxs))
+            frame_idxs = np.clip(frame_idxs + jitter_offsets, 0, frame_count - 1)
+        result = self._read_frames_at_indices(path, capture, frame_idxs)
+        capture.release()
+        return result
+    def read_random_frames(self, path, num_frames, seed=None):
+        """Picks the frame indices at random.
+        Arguments:
+            path: the video file
+            num_frames: how many frames to read, -1 means the entire video
+                (warning: this will take up a lot of memory!)
+        """
+        assert num_frames > 0
+        np.random.seed(seed)
+        capture = cv2.VideoCapture(path)
+        frame_count = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))
+        if frame_count <= 0: return None
+        frame_idxs = sorted(np.random.choice(np.arange(0, frame_count), num_frames))
+        result = self._read_frames_at_indices(path, capture, frame_idxs)
+        capture.release()
+        return result
+    def read_frames_at_indices(self, path, frame_idxs):
+        """Reads frames from a video and puts them into a NumPy array.
+        Arguments:
+            path: the video file
+            frame_idxs: a list of frame indices. Important: should be
+                sorted from low-to-high! If an index appears multiple
+                times, the frame is still read only once.
+        Returns:
+            - a NumPy array of shape (num_frames, height, width, 3)
+            - a list of the frame indices that were read
+        Reading stops if loading a frame fails, in which case the first
+        dimension returned may actually be less than num_frames.
+        Returns None if an exception is thrown for any reason, or if no
+        frames were read.
+        """
+        assert len(frame_idxs) > 0
+        capture = cv2.VideoCapture(path)
+        result = self._read_frames_at_indices(path, capture, frame_idxs)
+        capture.release()
+        return result
+    def _read_frames_at_indices(self, path, capture, frame_idxs):
+        try:
+            frames = []
+            idxs_read = []
+            for frame_idx in range(frame_idxs[0], frame_idxs[-1] + 1):
+                # Get the next frame, but don't decode if we're not using it.
+                ret = capture.grab()
+                if not ret:
+                    if self.verbose:
+                        print("Error grabbing frame %d from movie %s" % (frame_idx, path))
+                    break
+                # Need to look at this frame?
+                current = len(idxs_read)
+                if frame_idx == frame_idxs[current]:
+                    ret, frame = capture.retrieve()
+                    if not ret or frame is None:
+                        if self.verbose:
+                            print("Error retrieving frame %d from movie %s" % (frame_idx, path))
+                        break
+                    frame = self._postprocess_frame(frame)
+                    frames.append(frame)
+                    idxs_read.append(frame_idx)
+            if len(frames) > 0:
+                return np.stack(frames), idxs_read
+            if self.verbose:
+                print("No frames read from movie %s" % path)
+            return None
+        except:
+            if self.verbose:
+                print("Exception while reading movie %s" % path)
+            return None
+    def read_middle_frame(self, path):
+        """Reads the frame from the middle of the video."""
+        capture = cv2.VideoCapture(path)
+        frame_count = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))
+        result = self._read_frame_at_index(path, capture, frame_count // 2)
+        capture.release()
+        return result
+    def read_frame_at_index(self, path, frame_idx):
+        """Reads a single frame from a video.
+        If you just want to read a single frame from the video, this is more
+        efficient than scanning through the video to find the frame. However,
+        for reading multiple frames it's not efficient.
+        My guess is that a "streaming" approach is more efficient than a
+        "random access" approach because, unless you happen to grab a keyframe,
+        the decoder still needs to read all the previous frames in order to
+        reconstruct the one you're asking for.
+        Returns a NumPy array of shape (1, H, W, 3) and the index of the frame,
+        or None if reading failed.
+        """
+        capture = cv2.VideoCapture(path)
+        result = self._read_frame_at_index(path, capture, frame_idx)
+        capture.release()
+        return result
+    def _read_frame_at_index(self, path, capture, frame_idx):
+        capture.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
+        ret, frame = capture.read()
+        if not ret or frame is None:
+            if self.verbose:
+                print("Error retrieving frame %d from movie %s" % (frame_idx, path))
+            return None
+        else:
+            frame = self._postprocess_frame(frame)
+            return np.expand_dims(frame, axis=0), [frame_idx]
+    def _postprocess_frame(self, frame):
+        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+        if self.insets[0] > 0:
+            W = frame.shape[1]
+            p = int(W * self.insets[0])
+            frame = frame[:, p:-p, :]
+        if self.insets[1] > 0:
+            H = frame.shape[1]
+            q = int(H * self.insets[1])
+            frame = frame[q:-q, :, :]
+        return frame
+class VideoReaderIspl(VideoReader):
+    """
+    Derived VideoReader class with overriden read_frames method
+    """
+    def read_frames_with_hop(self, path: str, num_frames: int = -1, fps: int = -1):
+        """Reads frames up to a certain number spaced throughout the video with a rate decided by the user.
+        Arguments:
+            path: the video file
+            num_frames: how many frames to read, -1 means the entire video
+                (warning: this will take up a lot of memory!)
+            fps: how many frames per second to pick
+        """
+        assert num_frames > 0
+        capture = cv2.VideoCapture(path)
+        frame_count = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))
+        if frame_count <= 0: return None
+        video_rate = capture.get(cv2.CAP_PROP_FPS)
+        hop = 1 if fps == -1 else max(video_rate // fps, 1)
+        end_pts = frame_count if num_frames == -1 else num_frames * hop
+        frame_idxs = np.arange(0, end_pts - 1, hop, endpoint=True, dtype=int)
+        result = self._read_frames_at_indices(path, capture, frame_idxs)
+        capture.release()
+        return result

models/icpr2020dfdc/environment.yml ADDED Viewed

	@@ -0,0 +1,25 @@

+name: icpr2020
+channels:
+  - pytorch
+  - conda-forge
+  - defaults
+dependencies:
+  - av=6.2.0
+  - albumentations
+  - cudatoolkit
+  - ffmpeg
+  - jupyter
+  - numpy
+  - opencv=3.4.2
+  - py-opencv=3.4.2
+  - python=3.6.9
+  - pip
+  - pytorch=1.4.0
+  - torchvision
+  - tqdm
+  - pandas
+  - pip:
+    - tensorboardx==2.0
+    - efficientnet-pytorch
+    - scikit-learn

models/icpr2020dfdc/extract_faces.py ADDED Viewed

	@@ -0,0 +1,346 @@

+"""
+Extract faces
+Video Face Manipulation Detection Through Ensemble of CNNs
+Image and Sound Processing Lab - Politecnico di Milano
+Nicolò Bonettini
+Edoardo Daniele Cannas
+Sara Mandelli
+Luca Bondi
+Paolo Bestagini
+"""
+import argparse
+import sys
+import traceback
+from concurrent.futures import ThreadPoolExecutor
+from functools import partial
+from pathlib import Path
+from typing import Tuple, List
+import numpy as np
+import pandas as pd
+import torch
+import torch.cuda
+from PIL import Image
+from tqdm import tqdm
+import blazeface
+from blazeface import BlazeFace, VideoReader, FaceExtractor
+from isplutils.utils import adapt_bb
+def parse_args(argv):
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--source', type=Path, help='Videos root directory', required=True)
+    parser.add_argument('--videodf', type=Path, help='Path to read the videos DataFrame', required=True)
+    parser.add_argument('--facesfolder', type=Path, help='Faces output root directory', required=True)
+    parser.add_argument('--facesdf', type=Path, help='Path to save the output DataFrame of faces', required=True)
+    parser.add_argument('--checkpoint', type=Path, help='Path to save the temporary per-video outputs', required=True)
+    parser.add_argument('--fpv', type=int, default=32, help='Frames per video')
+    parser.add_argument('--device', type=torch.device,
+                        default=torch.device('cuda:0' if torch.cuda.is_available() else 'cpu'),
+                        help='Device to use for face extraction')
+    parser.add_argument('--collateonly', help='Only perform collation of pre-existing results', action='store_true')
+    parser.add_argument('--noindex', help='Do not rebuild the index', action='store_false')
+    parser.add_argument('--batch', type=int, help='Batch size', default=16)
+    parser.add_argument('--threads', type=int, help='Number of threads', default=8)
+    parser.add_argument('--offset', type=int, help='Offset to start extraction', default=0)
+    parser.add_argument('--num', type=int, help='Number of videos to process', default=0)
+    parser.add_argument('--lazycheck', action='store_true', help='Lazy check of existing video indexes')
+    parser.add_argument('--deepcheck', action='store_true', help='Try to open every image')
+    return parser.parse_args(argv)
+def main(argv):
+    """Runs the face-extraction pipeline described by the command-line arguments.
+    Unless --collateonly is given, every selected video is passed to
+    process_video() and the face images and per-video DataFrame it returns
+    are written to disk. Then, unless --noindex is given, the per-video
+    checkpoint DataFrames are collated into a single faces DataFrame and the
+    videos DataFrame is re-saved with an added 'nfaces' column.
+    Arguments:
+        argv: list of command-line argument strings, see parse_args()
+    Raises:
+        ValueError: if indexing is enabled and no per-video checkpoint could
+            be read
+    """
+    args = parse_args(argv)
+    ## Parameters parsing
+    device: torch.device = args.device
+    source_dir: Path = args.source
+    facedestination_dir: Path = args.facesfolder
+    frames_per_video: int = args.fpv
+    videodataset_path: Path = args.videodf
+    facesdataset_path: Path = args.facesdf
+    collateonly: bool = args.collateonly
+    batch_size: int = args.batch
+    threads: int = args.threads
+    offset: int = args.offset
+    num: int = args.num
+    lazycheck: bool = args.lazycheck
+    deepcheck: bool = args.deepcheck
+    checkpoint_folder: Path = args.checkpoint
+    # '--noindex' is declared with action='store_false', so this is True
+    # unless the flag was passed.
+    index_enable: bool = args.noindex
+    ## Parameters
+    # Forwarded to process_video() as its face_size argument.
+    face_size = 512
+    print('Loading video DataFrame')
+    df_videos = pd.read_pickle(videodataset_path)
+    # --offset / --num select a contiguous slice of the videos; num <= 0
+    # means "everything from offset onwards".
+    if num > 0:
+        df_videos_process = df_videos.iloc[offset:offset + num]
+    else:
+        df_videos_process = df_videos.iloc[offset:]
+    if not collateonly:
+        ## Blazeface loading
+        print('Loading face extractor')
+        facedet = BlazeFace().to(device)
+        # Both paths are relative to the current working directory.
+        facedet.load_weights("blazeface/blazeface.pth")
+        facedet.load_anchors("blazeface/anchors.npy")
+        videoreader = VideoReader(verbose=False)
+        video_read_fn = lambda x: videoreader.read_frames(x, num_frames=frames_per_video)
+        face_extractor = FaceExtractor(video_read_fn, facedet)
+        ## Face extraction
+        # Videos are handled batch_size at a time; the videos of one batch are
+        # mapped over the thread pool.
+        with ThreadPoolExecutor(threads) as p:
+            for batch_idx0 in tqdm(np.arange(start=0, stop=len(df_videos_process), step=batch_size),
+                                   desc='Extracting faces'):
+                tosave_list = list(p.map(partial(process_video,
+                                                 source_dir=source_dir,
+                                                 facedestination_dir=facedestination_dir,
+                                                 checkpoint_folder=checkpoint_folder,
+                                                 face_size=face_size,
+                                                 face_extractor=face_extractor,
+                                                 lazycheck=lazycheck,
+                                                 deepcheck=deepcheck,
+                                                 ),
+                                         df_videos_process.iloc[batch_idx0:batch_idx0 + batch_size].iterrows()))
+                # Per process_video()'s return annotation each entry is either
+                # None or (per-video DataFrame, checkpoint path, list of
+                # (image, path) pairs). Images are written first, then the
+                # DataFrame is pickled to the checkpoint path.
+                for tosave in tosave_list:
+                    if tosave is not None:
+                        if len(tosave[2]):
+                            list(p.map(save_jpg, tosave[2]))
+                        tosave[1].parent.mkdir(parents=True, exist_ok=True)
+                        tosave[0].to_pickle(str(tosave[1]))
+    if index_enable:
+        # Collect checkpoints
+        # This walks ALL videos of the DataFrame, not only the processed
+        # slice. Videos without a readable checkpoint keep nfaces == 0.
+        df_videos['nfaces'] = np.zeros(len(df_videos), np.uint8)
+        faces_dataset = []
+        for idx, record in tqdm(df_videos.iterrows(), total=len(df_videos), desc='Collecting faces results'):
+            # Checkpoint
+            video_face_checkpoint_path = checkpoint_folder.joinpath(record['path']).with_suffix('.faces.pkl')
+            if video_face_checkpoint_path.exists():
+                try:
+                    df_video_faces = pd.read_pickle(str(video_face_checkpoint_path))
+                    # Fix same attribute issue
+                    df_video_faces = df_video_faces.rename(columns={'subject': 'videosubject'}, errors='ignore')
+                    # The index entries end in '_subj<N>.jpg'; nfaces is the
+                    # number of distinct <N> values found for this video.
+                    nfaces = len(
+                        np.unique(df_video_faces.index.map(lambda x: int(x.split('_subj')[1].split('.jpg')[0]))))
+                    df_videos.loc[idx, 'nfaces'] = nfaces
+                    faces_dataset.append(df_video_faces)
+                except Exception as e:
+                    # A checkpoint that cannot be read or parsed is reported
+                    # and deleted.
+                    print('Error while reading: {}'.format(video_face_checkpoint_path))
+                    print(e)
+                    video_face_checkpoint_path.unlink()
+        if len(faces_dataset) == 0:
+            raise ValueError(f'No checkpoint found from face extraction. '
+                             f'Is the the source path {source_dir} correct for the videos in your dataframe?')
+        # Save videos with updated faces
+        print('Saving videos DataFrame to {}'.format(videodataset_path))
+        df_videos.to_pickle(str(videodataset_path))
+        # Derive the output file name. When only a slice of the videos was
+        # requested, the slice bounds are appended to the given file name
+        # (cut at its first '.'), or used to build a name inside the given
+        # directory.
+        if offset > 0:
+            if num > 0:
+                if facesdataset_path.is_dir():
+                    facesdataset_path = facesdataset_path.joinpath(
+                        'faces_df_from_video_{}_to_video_{}.pkl'.format(offset, num + offset))
+                else:
+                    facesdataset_path = facesdataset_path.parent.joinpath(
+                        str(facesdataset_path.parts[-1]).split('.')[0] + '_from_video_{}_to_video_{}.pkl'.format(offset,
+                                                                                                                 num + offset))
+            else:
+                if facesdataset_path.is_dir():
+                    facesdataset_path = facesdataset_path.joinpath('faces_df_from_video_{}.pkl'.format(offset))
+                else:
+                    facesdataset_path = facesdataset_path.parent.joinpath(
+                        str(facesdataset_path.parts[-1]).split('.')[0] + '_from_video_{}.pkl'.format(offset))
+        elif num > 0:
+            if facesdataset_path.is_dir():
+                facesdataset_path = facesdataset_path.joinpath(
+                    'faces_df_from_video_{}_to_video_{}.pkl'.format(0, num))
+            else:
+                facesdataset_path = facesdataset_path.parent.joinpath(
+                    str(facesdataset_path.parts[-1]).split('.')[0] + '_from_video_{}_to_video_{}.pkl'.format(0, num))
+        else:
+            if facesdataset_path.is_dir():
+                facesdataset_path = facesdataset_path.joinpath('faces_df.pkl')  # just a check if the path is a dir
+        # Creates directory (if doesn't exist)
+        facesdataset_path.parent.mkdir(parents=True, exist_ok=True)
+        print('Saving faces DataFrame to {}'.format(facesdataset_path))
+        df_faces = pd.concat(faces_dataset, axis=0, )
+        # Cast the columns to compact dtypes before saving.
+        df_faces['video'] = df_faces['video'].astype('category')
+        for key in ['kp1x', 'kp1y', 'kp2x', 'kp2y', 'kp3x',
+                    'kp3y', 'kp4x', 'kp4y', 'kp5x', 'kp5y', 'kp6x', 'kp6y', 'left',
+                    'top', 'right', 'bottom', ]:
+            df_faces[key] = df_faces[key].astype(np.int16)
+        df_faces['videosubject'] = df_faces['videosubject'].astype(np.int8)
+        # Eventually remove duplicates
+        df_faces = df_faces.loc[~df_faces.index.duplicated(keep='first')]
+        # Copy the per-video columns that exist in the videos DataFrame onto
+        # every face row, matching the 'video' column against the videos index.
+        fields_to_preserve_from_video = [i for i in
+                                         ['folder', 'subject', 'scene', 'cluster', 'nfaces', 'test'] if
+                                         i in df_videos]
+        df_faces = pd.merge(df_faces, df_videos[fields_to_preserve_from_video], left_on='video',
+                            right_index=True)
+        df_faces.to_pickle(str(facesdataset_path))
+    print('Completed!')
+def save_jpg(args: Tuple[Image.Image, Path or str]):
+    image, path = args
+    image.save(path, quality=95, subsampling='4:4:4')
def process_video(item: Tuple[pd.Index, pd.Series],
                  source_dir: Path,
                  facedestination_dir: Path,
                  checkpoint_folder: Path,
                  face_size: int,
                  face_extractor: FaceExtractor,
                  lazycheck: bool = False,
                  deepcheck: bool = False,
                  ) -> (pd.DataFrame, Path, List[Tuple[Image.Image, Path]]) or None:
    """
    Extract the faces of one indexed video, unless a valid checkpoint already exists.

    :param item: ``(idx, record)`` pair as produced by ``DataFrame.iterrows()``; the record must
                 provide 'path', 'label' and 'original' (and optionally 'class', 'source', 'quality')
    :param source_dir: root folder of the videos; ``record['path']`` is relative to it
    :param facedestination_dir: root folder where the face crops are (to be) stored
    :param checkpoint_folder: root folder of the per-video ``.faces.pkl`` checkpoints
    :param face_size: side of the square crop requested from ``adapt_bb``
    :param face_extractor: extractor providing ``process_video`` and ``keep_only_best_face``
    :param lazycheck: if True, an existing checkpoint is trusted without any verification
    :param deepcheck: if True (and ``lazycheck`` is False), every face image referenced by the
                      checkpoint is also opened and checked to be a non-empty 3-dimensional array
    :return: ``(df_video_faces, video_faces_checkpoint_path, images_to_save)`` when the video has been
             processed (the DataFrame is empty if no face was kept). ``None`` when a checkpoint exists
             and was not invalidated, when no frame could be read, or when processing failed
             (the error is printed, not raised). Nothing is written to disk here except the
             creation of destination folders.
    """
    # Instantiate Index and Series
    idx, record = item

    # Checkpoint
    video_faces_checkpoint_path = checkpoint_folder.joinpath(record['path']).with_suffix('.faces.pkl')

    if not lazycheck:
        if video_faces_checkpoint_path.exists():
            try:
                # NOTE: unpickling is only safe because the checkpoint folder is written by this tool;
                # never point it to untrusted files.
                df_video_faces = pd.read_pickle(str(video_faces_checkpoint_path))
                for _, r in df_video_faces.iterrows():
                    face_path = facedestination_dir.joinpath(r.name)
                    # Explicit raises instead of asserts: asserts are stripped with `python -O`,
                    # which would silently accept a broken checkpoint.
                    if not face_path.exists():
                        raise FileNotFoundError('Missing face image: {}'.format(face_path))
                    if deepcheck:
                        # Context manager so that the file handle is released right away
                        with Image.open(face_path) as img:
                            img_arr = np.asarray(img)
                        if img_arr.ndim != 3:
                            raise ValueError('Face image is not 3-dimensional: {}'.format(face_path))
                        if not (np.prod(img_arr.shape) > 0):
                            raise ValueError('Face image is empty: {}'.format(face_path))
            except Exception as e:
                # Any failure invalidates the checkpoint, so that the video is processed again below
                print('Error while checking: {}'.format(video_faces_checkpoint_path))
                print(e)
                video_faces_checkpoint_path.unlink()

    if not (video_faces_checkpoint_path.exists()):
        try:
            video_face_dict_list = []

            # Load faces
            current_video_path = source_dir.joinpath(record['path'])
            if not current_video_path.exists():
                raise FileNotFoundError(f'Unable to find {current_video_path}. '
                                        f'Are you sure that {source_dir} is the correct source directory for the video '
                                        f'you indexed in the dataframe?')

            frames = face_extractor.process_video(current_video_path)

            if len(frames) == 0:
                return

            face_extractor.keep_only_best_face(frames)
            # A single face per frame is kept, so every detection belongs to subject 0
            for frame in frames:
                frame['subjects'] = [0] * len(frame['detections'])

            # Extract and save faces, bounding boxes, keypoints
            images_to_save: List[Tuple[Image.Image, Path]] = []
            for frame in frames:
                if len(frame['detections']):
                    fullframe = Image.fromarray(frame['frame'])

                    # Preserve the only found face even if not a good one, otherwise preserve only clusters > -1
                    subjects = np.unique(frame['subjects'])
                    if len(subjects) > 1:
                        subjects = np.asarray([s for s in subjects if s > -1])

                    for face_idx, _ in enumerate(frame['faces']):
                        subj_id = frame['subjects'][face_idx]
                        if subj_id in subjects:  # Exclude outliers if other faces detected
                            face_path = facedestination_dir.joinpath(record['path'], 'fr{:03d}_subj{:1d}.jpg'.format(
                                frame['frame_idx'], subj_id))

                            face_dict = {'facepath': str(face_path.relative_to(facedestination_dir)), 'video': idx,
                                         'label': record['label'], 'videosubject': subj_id,
                                         'original': record['original']}
                            # add attributes for ff++
                            for optional_key in ('class', 'source', 'quality'):
                                if optional_key in record.keys():
                                    face_dict.update({optional_key: record[optional_key]})

                            for field_idx, key in enumerate(blazeface.BlazeFace.detection_keys):
                                face_dict[key] = frame['detections'][face_idx][field_idx]

                            cropping_bb = adapt_bb(frame_height=fullframe.height,
                                                   frame_width=fullframe.width,
                                                   bb_height=face_size,
                                                   bb_width=face_size,
                                                   left=face_dict['xmin'],
                                                   top=face_dict['ymin'],
                                                   right=face_dict['xmax'],
                                                   bottom=face_dict['ymax'])
                            face = fullframe.crop(cropping_bb)

                            # Express box and keypoint coordinates relative to the crop origin
                            for key in blazeface.BlazeFace.detection_keys:
                                if (key[0] == 'k' and key[-1] == 'x') or (key[0] == 'x'):
                                    face_dict[key] -= cropping_bb[0]
                                elif (key[0] == 'k' and key[-1] == 'y') or (key[0] == 'y'):
                                    face_dict[key] -= cropping_bb[1]

                            face_dict['left'] = face_dict.pop('xmin')
                            face_dict['top'] = face_dict.pop('ymin')
                            face_dict['right'] = face_dict.pop('xmax')
                            face_dict['bottom'] = face_dict.pop('ymax')

                            face_path.parent.mkdir(parents=True, exist_ok=True)
                            images_to_save.append((face, face_path))

                            video_face_dict_list.append(face_dict)

            if len(video_face_dict_list) > 0:
                df_video_faces = pd.DataFrame(video_face_dict_list)
                df_video_faces.index = df_video_faces['facepath']
                del df_video_faces['facepath']

                # type conversions
                for key in ['kp1x', 'kp1y', 'kp2x', 'kp2y', 'kp3x', 'kp3y',
                            'kp4x', 'kp4y', 'kp5x', 'kp5y', 'kp6x', 'kp6y', 'left', 'top',
                            'right', 'bottom']:
                    df_video_faces[key] = df_video_faces[key].astype(np.int16)
                df_video_faces['conf'] = df_video_faces['conf'].astype(np.float32)
                df_video_faces['video'] = df_video_faces['video'].astype('category')

                video_faces_checkpoint_path.parent.mkdir(parents=True, exist_ok=True)

            else:
                print('No faces extracted for video {}'.format(record['path']))
                df_video_faces = pd.DataFrame()

            return df_video_faces, video_faces_checkpoint_path, images_to_save

        except Exception as e:
            # Best effort: a failing video is reported and skipped, it must not stop the whole run
            print('Error while processing: {}'.format(record['path']))
            print("-" * 60)
            traceback.print_exc(file=sys.stdout, limit=5)
            print("-" * 60)
            return
+if __name__ == '__main__':
+    main(sys.argv[1:])

models/icpr2020dfdc/index_celebdf.py ADDED Viewed

	@@ -0,0 +1,85 @@

+"""
+Index Celeb-DF v2
+Image and Sound Processing Lab - Politecnico di Milano
+Nicolò Bonettini
+Edoardo Daniele Cannas
+Sara Mandelli
+Luca Bondi
+Paolo Bestagini
+"""
+import argparse
+from multiprocessing import Pool
+from pathlib import Path
+import numpy as np
+import pandas as pd
+from isplutils.utils import extract_meta_av, extract_meta_cv
def main():
    """
    Build (or reload) the Celeb-DF v2 video index and print the number of real and fake videos.

    Command line options: ``--source`` (required dataset root) and ``--videodataset``
    (pickle file of the resulting DataFrame). If the pickle already exists it is only loaded.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument('--source', type=Path, help='Source dir',
                     required=True)
    cli.add_argument('--videodataset', type=Path, default='data/celebdf_videos.pkl',
                     help='Path to save the videos DataFrame')
    args = cli.parse_args()

    source_dir: Path = args.source
    videodataset_path: Path = args.videodataset

    # The output folder is created in any case
    videodataset_path.parent.mkdir(parents=True, exist_ok=True)

    if videodataset_path.exists():
        print('Loading video DataFrame')
        df_videos = pd.read_pickle(videodataset_path)
    else:
        print('Creating video DataFrame')

        split_file = Path(source_dir).joinpath('List_of_testing_videos.txt')
        if not split_file.exists():
            raise FileNotFoundError('Unable to find "List_of_testing_videos.txt" in {}'.format(source_dir))
        # NOTE(review): header=0 consumes the first line of the list as a header row --
        # confirm that the file really starts with one.
        test_videos_df = pd.read_csv(split_file, delimiter=' ', header=0, index_col=1)

        video_files = Path(source_dir).rglob('*.mp4')
        df_videos = pd.DataFrame(
            {'path': [video.relative_to(source_dir) for video in video_files]})

        # Placeholder columns, filled with the decoded metadata right below
        meta_columns = ['height', 'width', 'frames']
        zeros = np.zeros(len(df_videos), dtype=np.uint16)
        for column in meta_columns:
            df_videos[column] = zeros

        absolute_paths = df_videos['path'].map(lambda x: str(source_dir.joinpath(x)))
        with Pool() as p:
            meta = p.map(extract_meta_av, absolute_paths)
        meta = np.stack(meta)
        df_videos.loc[:, meta_columns] = meta

        # Fix for videos that av cannot decode properly
        undecoded = df_videos[df_videos['frames'] == 0]
        for idx, record in undecoded.iterrows():
            meta = extract_meta_cv(str(source_dir.joinpath(record['path'])))
            df_videos.loc[idx, meta_columns] = meta

        # The first path component is the class folder
        df_videos['class'] = df_videos['path'].map(lambda x: x.parts[0]).astype('category')
        df_videos['label'] = df_videos['class'].map(
            lambda x: x == 'Celeb-synthesis')  # True is FAKE, False is REAL
        df_videos['name'] = df_videos['path'].map(lambda x: x.with_suffix('').name)

        def _original_index(fake_name):
            # The original video name is made of the first and third '_'-separated tokens of the fake name
            tokens = fake_name.split('_')
            original_name = '_'.join([tokens[0], tokens[2]])
            return df_videos.index[np.flatnonzero(df_videos['name'] == original_name)[0]]

        # -1 marks "no original" (real videos)
        df_videos['original'] = -1 * np.ones(len(df_videos), dtype=np.int16)
        is_fake = (df_videos['label'] == True)
        df_videos.loc[is_fake, 'original'] = df_videos[is_fake]['name'].map(_original_index)

        df_videos['test'] = df_videos['path'].map(str).isin(test_videos_df.index)

        print('Saving video DataFrame to {}'.format(videodataset_path))
        df_videos.to_pickle(str(videodataset_path))

    print('Real videos: {:d}'.format(sum(df_videos['label'] == 0)))
    print('Fake videos: {:d}'.format(sum(df_videos['label'] == 1)))
+if __name__ == '__main__':
+    main()

models/icpr2020dfdc/index_dfdc.py ADDED Viewed

	@@ -0,0 +1,94 @@

+"""
+Index the official Kaggle training dataset and prepares a train and validation set based on folders
+Video Face Manipulation Detection Through Ensemble of CNNs
+Image and Sound Processing Lab - Politecnico di Milano
+Nicolò Bonettini
+Edoardo Daniele Cannas
+Sara Mandelli
+Luca Bondi
+Paolo Bestagini
+"""
+import sys
+import argparse
+from multiprocessing import Pool
+from pathlib import Path
+import numpy as np
+import pandas as pd
+from tqdm import tqdm
+from isplutils.utils import extract_meta_av
def parse_args(argv):
    """
    Parse the command line of the DFDC indexing script.

    :param argv: list of command line tokens (without the program name)
    :return: namespace with ``source`` (Path, required), ``videodataset`` (Path) and ``batch`` (int)
    """
    option_specs = (
        ('--source', dict(type=Path, help='Source dir', required=True)),
        ('--videodataset', dict(type=Path, default='data/dfdc_videos.pkl',
                                help='Path to save the videos DataFrame')),
        ('--batch', dict(type=int, help='Batch size', default=64)),
    )
    cli = argparse.ArgumentParser()
    for flag, settings in option_specs:
        cli.add_argument(flag, **settings)
    return cli.parse_args(argv)
def main(argv):
    """
    Build (or reload) the DFDC video index and print the number of real and fake videos.

    Every ``metadata.json`` found under the source folder is loaded, the video height, width and
    number of frames are collected in batches with a process pool, and the resulting DataFrame
    is pickled. If the pickle already exists it is only loaded.

    :param argv: list of command line tokens (without the program name)
    """
    args = parse_args(argv)
    source_dir: Path = args.source
    videodataset_path: Path = args.videodataset
    batch_size: int = args.batch

    if videodataset_path.exists():
        print('Loading video DataFrame')
        df_videos = pd.read_pickle(videodataset_path)
    else:
        print('Creating video DataFrame')

        # Create output folder
        videodataset_path.parent.mkdir(parents=True, exist_ok=True)

        # Index: one DataFrame per metadata.json, i.e. per folder
        metadata_files = sorted(source_dir.rglob('metadata.json'))
        folder_frames = []
        for json_path in tqdm(metadata_files, desc='Indexing'):
            folder_df = pd.read_json(json_path, orient='index')
            folder_df['path'] = folder_df.index.map(
                lambda x: str(json_path.parent.relative_to(source_dir).joinpath(x)))
            # The folder number is the last '_'-separated token of the folder name
            folder_df['folder'] = int(str(json_path.parts[-2]).split('_')[-1])
            folder_frames.append(folder_df)
        df_videos = pd.concat(folder_frames, axis=0, verify_integrity=True)

        # Save space
        del df_videos['split']
        df_videos['label'] = df_videos['label'] == 'FAKE'
        df_videos['original'] = df_videos['original'].astype('category')
        df_videos['folder'] = df_videos['folder'].astype(np.uint8)

        # Collect metadata, one batch of videos at a time
        paths_arr = np.asarray(df_videos.path.map(lambda x: str(source_dir.joinpath(x))))
        meta_rows = []
        with Pool() as pool:
            batch_starts = np.arange(start=0, stop=len(df_videos), step=batch_size)
            for first in tqdm(batch_starts, desc='Metadata'):
                meta_rows.extend(pool.map(extract_meta_av, paths_arr[first:first + batch_size]))

        # Each metadata row is (height, width, frames)
        for position, column in enumerate(('height', 'width', 'frames')):
            df_videos[column] = np.asarray([row[position] for row in meta_rows], dtype=np.uint16)

        print('Saving video DataFrame to {}'.format(videodataset_path))
        df_videos.to_pickle(str(videodataset_path))

    print('Real videos: {:d}'.format(sum(df_videos['label'] == 0)))
    print('Fake videos: {:d}'.format(sum(df_videos['label'] == 1)))
+if __name__ == '__main__':
+    main(sys.argv[1:])

models/icpr2020dfdc/index_ffpp.py ADDED Viewed

	@@ -0,0 +1,92 @@

+"""
+Index FaceForensics++
+Video Face Manipulation Detection Through Ensemble of CNNs
+Image and Sound Processing Lab - Politecnico di Milano
+Nicolò Bonettini
+Edoardo Daniele Cannas
+Sara Mandelli
+Luca Bondi
+Paolo Bestagini
+"""
+import argparse
+import sys
+from multiprocessing import Pool
+from pathlib import Path
+import numpy as np
+import pandas as pd
+from isplutils.utils import extract_meta_av, extract_meta_cv
def parse_args(argv):
    """
    Parse the command line of the FaceForensics++ indexing script.

    :param argv: list of command line tokens (without the program name)
    :return: namespace with ``source`` and ``videodataset``, both converted to Path
    """
    option_specs = (
        ('--source', dict(type=Path, help='Source dir',
                          default='dataset/ffpp/faceforensics')),
        ('--videodataset', dict(type=Path, default='data/ffpp_videos.pkl',
                                help='Path to save the videos DataFrame')),
    )
    cli = argparse.ArgumentParser()
    for flag, settings in option_specs:
        cli.add_argument(flag, **settings)
    return cli.parse_args(argv)
def main(argv):
    """
    Build (or reload) the FaceForensics++ video index and print the number of real and fake videos.

    All ``*.mp4`` files under the source folder are indexed, except those whose path *relative to
    the source folder* contains 'mask' or 'raw'. Class, source and quality are read from the first
    three components of the relative path. If the pickle already exists it is only loaded.

    :param argv: list of command line tokens (without the program name)
    """
    ## Parameters parsing
    args = parse_args(argv)
    source_dir: Path = args.source
    videodataset_path: Path = args.videodataset

    # Create output folder (if doesn't exist)
    videodataset_path.parent.mkdir(parents=True, exist_ok=True)

    ## DataFrame
    if videodataset_path.exists():
        print('Loading video DataFrame')
        df_videos = pd.read_pickle(videodataset_path)
    else:
        print('Creating video DataFrame')

        # Filter on the path relative to source_dir: testing the full path would drop every
        # video as soon as the source folder itself contains 'mask' or 'raw' in its name.
        relative_paths = [f.relative_to(source_dir) for f in Path(source_dir).rglob('*.mp4')]
        df_videos = pd.DataFrame(
            {'path': [p for p in relative_paths if 'mask' not in str(p) and 'raw' not in str(p)]})

        # Placeholder columns, filled with the decoded metadata right below
        df_videos['height'] = df_videos['width'] = df_videos['frames'] = np.zeros(len(df_videos), dtype=np.uint16)
        with Pool() as p:
            meta = p.map(extract_meta_av, df_videos['path'].map(lambda x: str(source_dir.joinpath(x))))
        meta = np.stack(meta)
        df_videos.loc[:, ['height', 'width', 'frames']] = meta

        # Fix for videos that av cannot decode properly
        for idx, record in df_videos[df_videos['frames'] == 0].iterrows():
            meta = extract_meta_cv(str(source_dir.joinpath(record['path'])))
            df_videos.loc[idx, ['height', 'width', 'frames']] = meta

        df_videos['class'] = df_videos['path'].map(lambda x: x.parts[0]).astype('category')
        df_videos['label'] = df_videos['class'].map(
            lambda x: True if x == 'manipulated_sequences' else False)  # True is FAKE, False is REAL
        df_videos['source'] = df_videos['path'].map(lambda x: x.parts[1]).astype('category')
        df_videos['quality'] = df_videos['path'].map(lambda x: x.parts[2]).astype('category')
        df_videos['name'] = df_videos['path'].map(lambda x: x.with_suffix('').parts[-1])

        # -1 marks "no original" (real videos)
        df_videos['original'] = -1 * np.ones(len(df_videos), dtype=np.int16)

        # Fakes outside DeepFakeDetection: the original name is the first '_'-separated token
        df_videos.loc[(df_videos['label'] == True) & (df_videos['source'] != 'DeepFakeDetection'), 'original'] = \
            df_videos[(df_videos['label'] == True) & (df_videos['source'] != 'DeepFakeDetection')]['name'].map(
                lambda x: df_videos.index[np.flatnonzero(df_videos['name'] == x.split('_')[0])[0]]
            )
        # DeepFakeDetection fakes: the original name is '<first "_" token>__<second "__" token>'
        df_videos.loc[(df_videos['label'] == True) & (df_videos['source'] == 'DeepFakeDetection'), 'original'] = \
            df_videos[(df_videos['label'] == True) & (df_videos['source'] == 'DeepFakeDetection')]['name'].map(
                lambda x: df_videos.index[
                    np.flatnonzero(df_videos['name'] == x.split('_')[0] + '__' + x.split('__')[1])[0]]
            )

        print('Saving video DataFrame to {}'.format(videodataset_path))
        df_videos.to_pickle(str(videodataset_path))

    print('Real videos: {:d}'.format(sum(df_videos['label'] == 0)))
    print('Fake videos: {:d}'.format(sum(df_videos['label'] == 1)))
+if __name__ == '__main__':
+    main(sys.argv[1:])

models/icpr2020dfdc/isplutils/__init__.py ADDED Viewed

File without changes

models/icpr2020dfdc/isplutils/data.py ADDED Viewed

	@@ -0,0 +1,263 @@

+"""
+Video Face Manipulation Detection Through Ensemble of CNNs
+Image and Sound Processing Lab - Politecnico di Milano
+Nicolò Bonettini
+Edoardo Daniele Cannas
+Sara Mandelli
+Luca Bondi
+Paolo Bestagini
+"""
+import os
+from pathlib import Path
+from typing import List
+import albumentations as A
+import numpy as np
+import pandas as pd
+import torch
+from PIL import Image
+from albumentations.pytorch import ToTensorV2
+from torch.utils.data import Dataset, IterableDataset
+from .utils import extract_bb
def load_face(record: pd.Series, root: str, size: int, scale: str, transformer: A.BasicTransform) -> torch.Tensor:
    """
    Load the face described by a record, going through an on-disk cache when possible.

    The cached crop is read if present (a cache file that cannot be read is deleted and rebuilt).
    Otherwise the face is extracted from the frame with ``extract_bb`` and, when ``size < 256`` or
    ``scale == 'tight'``, stored in the cache. When the frame cannot be read, an all-zero image
    of shape ``(size, size, 3)`` is used instead.

    :param record: row whose name is the frame path relative to ``root`` and which provides the
                   'left', 'top', 'right' and 'bottom' bounding box fields
    :param root: root folder of the frames; the cache lives in its 'autocache' subfolder
    :param size: face size
    :param scale: extraction mode forwarded to ``extract_bb``
    :param transformer: callable applied as ``transformer(image=face)['image']``
    :return: the transformed face
    :raises RuntimeError: if the loaded image is not a 3-dimensional array
    """
    path = os.path.join(str(root), str(record.name))
    autocache = size < 256 or scale == 'tight'

    # For 'crop' and 'scale' the cached face depends on the size, for any other mode
    # (e.g. 'tight') it does not, hence no size level in the cache tree.
    cache_folder = Path(root).joinpath('autocache', scale)
    if scale in ('crop', 'scale'):
        cache_folder = cache_folder.joinpath(str(size))
    cached_path = str(cache_folder.joinpath(str(record.name)).with_suffix('.jpg'))

    blank_shape = (size, size, 3)
    face = np.zeros(blank_shape, dtype=np.uint8)

    # Only OSError is handled below: anything else (keyboard interrupts included) propagates
    if os.path.exists(cached_path):
        try:
            face = np.array(Image.open(cached_path))
            if face.ndim != 3:
                raise RuntimeError('Incorrect format: {}'.format(path))
        except OSError as e:
            print('Deleting corrupted cache file: {}'.format(cached_path))
            print(e)
            os.unlink(cached_path)
            face = np.zeros(blank_shape, dtype=np.uint8)

    # Also reached right after a corrupted cache file has been removed
    if not os.path.exists(cached_path):
        try:
            frame = Image.open(path)
            bb = tuple(record[side] for side in ('left', 'top', 'right', 'bottom'))
            face = extract_bb(frame, bb=bb, size=size, scale=scale)

            if autocache:
                os.makedirs(os.path.dirname(cached_path), exist_ok=True)
                face.save(cached_path, quality=95, subsampling='4:4:4')

            face = np.array(face)
            if face.ndim != 3:
                raise RuntimeError('Incorrect format: {}'.format(path))
        except OSError as e:
            print('Error while reading: {}'.format(path))
            print(e)
            face = np.zeros(blank_shape, dtype=np.uint8)

    return transformer(image=face)['image']
class FrameFaceIterableDataset(IterableDataset):
    """
    Iterable dataset yielding fake and real faces alternately (fake first), drawn from
    several face DataFrames.
    """

    def __init__(self,
                 roots: List[str],
                 dfs: List[pd.DataFrame],
                 size: int, scale: str,
                 num_samples: int = -1,
                 transformer: A.BasicTransform = ToTensorV2(),
                 output_index: bool = False,
                 labels_map: dict = None,
                 seed: int = None):
        """
        :param roots: List of root folders of the faces, one per DataFrame
        :param dfs: List of DataFrames of faces, with the bounding box columns read by ``load_face``
                    ('left', 'top', 'right', 'bottom') and a 'label' column
        :param size: face size
        :param scale: face extraction mode forwarded to ``load_face``
        :param num_samples: upper bound on the number of samples; any value <= 0 means twice the
                            size of the larger of the real and fake sets
        :param transformer: transformation applied to every face
        :param output_index: enable output of the record name as third element of each sample
        :param labels_map: map from the DataFrame labels to the actual labels
        :param seed: base seed of the sampling; drawn at random when None
        """
        self.dfs = dfs
        self.size = int(size)
        self.seed0 = np.random.choice(2 ** 32) if seed is None else int(seed)

        # Re-key every (copied) DataFrame as (df_idx, df_key), so that keys stay unique once concatenated
        rekeyed = []
        for df_idx, source_df in enumerate(self.dfs):
            keyed_df = source_df.copy()
            keyed_df.index = pd.MultiIndex.from_tuples(
                [(df_idx, key) for key in keyed_df.index], names=['df_idx', 'df_key'])
            rekeyed.append(keyed_df)

        self.df = pd.concat(rekeyed, axis=0, join='inner')
        self.df_real = self.df[self.df['label'] == 0]
        self.df_fake = self.df[self.df['label'] == 1]

        n_real, n_fake = len(self.df_real), len(self.df_fake)
        self.longer_set = 'real' if n_real > n_fake else 'fake'
        self.num_samples = max(n_real, n_fake) * 2
        if num_samples > 0:
            self.num_samples = min(self.num_samples, num_samples)

        self.output_idx = bool(output_index)
        self.scale = str(scale)
        self.roots = [str(r) for r in roots]
        self.transformer = transformer

        if labels_map is None:
            self.labels_map = {False: np.array([0., ]), True: np.array([1., ])}
        else:
            self.labels_map = dict(labels_map)

    def _get_face(self, item: pd.Index) -> (torch.Tensor, torch.Tensor) or (torch.Tensor, torch.Tensor, str):
        """
        Load one sample.

        :param item: ``(df_idx, df_key)`` key of the concatenated index
        :return: ``(face, label)``, plus the record name when the index output is enabled
        """
        df_idx, df_key = item[0], item[1]
        record = self.dfs[df_idx].loc[df_key]
        face = load_face(record=record,
                         root=self.roots[df_idx],
                         size=self.size,
                         scale=self.scale,
                         transformer=self.transformer)
        label = self.labels_map[record.label]
        if not self.output_idx:
            return face, label
        return face, label, record.name

    def __len__(self):
        return self.num_samples

    def __iter__(self):
        random_fake_idxs, random_real_idxs = get_iterative_real_fake_idxs(
            df_real=self.df_real,
            df_fake=self.df_fake,
            num_samples=self.num_samples,
            seed0=self.seed0
        )

        # Alternate fake and real samples until one of the two lists is exhausted
        while random_fake_idxs and random_real_idxs:
            yield self._get_face(random_fake_idxs.pop())
            yield self._get_face(random_real_idxs.pop())
def get_iterative_real_fake_idxs(df_real: pd.DataFrame, df_fake: pd.DataFrame,
                                 num_samples: int, seed0: int):
    """
    Draw two equally long, shuffled lists of fake and real index keys.

    Outside of a DataLoader worker, ``num_samples // 2`` keys are drawn per set: without
    replacement from the longer set, with replacement from the other one. Inside a worker,
    each worker takes its own contiguous slice of the longer set and draws the other set with
    replacement, seeding NumPy with ``seed0 + worker_id``.
    The global NumPy random state is reseeded by this function.

    :param df_real: DataFrame of real faces
    :param df_fake: DataFrame of fake faces
    :param num_samples: total number of samples (real + fake) over all workers
    :param seed0: base seed
    :return: ``(random_fake_idxs, random_real_idxs)`` lists
    """
    real_is_longer = len(df_real) > len(df_fake)
    couples = num_samples // 2

    worker_info = torch.utils.data.get_worker_info()
    if worker_info is None:
        np.random.seed(seed0)
        fake_pool = np.random.choice(df_fake.index, couples, replace=real_is_longer)
        real_pool = np.random.choice(df_real.index, couples, replace=not real_is_longer)
    else:
        worker_id = worker_info.id
        np.random.seed(seed0 + worker_id)
        couples_per_worker = couples // worker_info.num_workers
        own_slice = slice(worker_id * couples_per_worker, (worker_id + 1) * couples_per_worker)
        if real_is_longer:
            real_pool = df_real.index[own_slice]
            fake_pool = np.random.choice(df_fake.index, couples_per_worker, replace=True)
        else:
            fake_pool = df_fake.index[own_slice]
            real_pool = np.random.choice(df_real.index, couples_per_worker, replace=True)

    random_fake_idxs = list(np.random.permutation(fake_pool))
    random_real_idxs = list(np.random.permutation(real_pool))

    assert (len(random_fake_idxs) == len(random_real_idxs))

    return random_fake_idxs, random_real_idxs
class FrameFaceDatasetTest(Dataset):
    """
    Map-style dataset returning the faces of a DataFrame in order, optionally as a stack of
    augmented copies of each face.
    """

    def __init__(self, root: str, df: pd.DataFrame,
                 size: int, scale: str,
                 transformer: A.BasicTransform = ToTensorV2(),
                 labels_map: dict = None,
                 aug_transformers: List[A.BasicTransform] = None):
        """
        :param root: root folder of the faces
        :param df: DataFrame of faces, with the bounding box columns read by ``load_face``
                   ('left', 'top', 'right', 'bottom') and a 'label' column
        :param size: face size
        :param scale: face extraction mode forwarded to ``load_face``
        :param transformer: transformation applied to every face
        :param labels_map: dict to map df labels
        :param aug_transformers: if not None, every sample is returned as a stack of copies, one per
                                 augmentation, each followed by ``transformer``
        """
        self.df = df
        self.size = int(size)
        self.scale = str(scale)
        self.root = str(root)
        self.transformer = transformer
        self.aug_transformers = aug_transformers

        if labels_map is None:
            self.labels_map = {False: np.array([0., ]), True: np.array([1., ])}
        else:
            self.labels_map = dict(labels_map)

    def _get_face(self, item: pd.Index) -> (torch.Tensor, torch.Tensor) or (torch.Tensor, torch.Tensor, str):
        """
        Load one sample.

        :param item: key of the record in the DataFrame index
        :return: ``(face, label)``, or ``(faces, label)`` with the stacked augmented copies
        """
        record = self.df.loc[item]
        label = self.labels_map[record.label]
        shared_args = dict(record=record, root=self.root, size=self.size, scale=self.scale)

        if self.aug_transformers is None:
            return load_face(transformer=self.transformer, **shared_args), label

        augmented = [
            load_face(transformer=A.Compose([aug_transf, self.transformer]), **shared_args)
            for aug_transf in self.aug_transformers
        ]
        return torch.stack(augmented), label

    def __len__(self):
        return len(self.df)

    def __getitem__(self, item):
        # ``item`` is a position, translated here into the corresponding index key
        key = self.df.index[item]
        return self._get_face(key)

models/icpr2020dfdc/isplutils/data_siamese.py ADDED Viewed

	@@ -0,0 +1,78 @@

+"""
+Video Face Manipulation Detection Through Ensemble of CNNs
+Image and Sound Processing Lab - Politecnico di Milano
+Nicolò Bonettini
+Edoardo Daniele Cannas
+Sara Mandelli
+Luca Bondi
+Paolo Bestagini
+"""
+from typing import List
+import albumentations as A
+import pandas as pd
+from albumentations.pytorch import ToTensorV2
+from .data import FrameFaceIterableDataset, get_iterative_real_fake_idxs
class FrameFaceTripletIterableDataset(FrameFaceIterableDataset):
    """
    Iterable dataset yielding (anchor, positive, negative) face triplets, alternating
    fake-anchored and real-anchored triplets.
    """

    def __init__(self,
                 roots: List[str],
                 dfs: List[pd.DataFrame],
                 size: int,
                 scale: str,
                 num_triplets: int = -1,
                 transformer: A.BasicTransform = ToTensorV2(),
                 seed: int = None):
        """
        :param roots: List of root folders of the faces, one per DataFrame
        :param dfs: List of DataFrames of faces with a 'label' column
        :param size: face size
        :param scale: face extraction mode forwarded to the parent dataset
        :param num_triplets: requested number of triplets (passed to the parent as
                             ``num_triplets * 3`` samples); the actual number is rounded down
                             to an even value
        :param transformer: transformation applied to every face
        :param seed: base seed of the sampling
        """
        super().__init__(
            roots=roots,
            dfs=dfs,
            size=size,
            scale=scale,
            num_samples=num_triplets * 3,
            transformer=transformer,
            seed=seed
        )
        # Triplets come in couples (one fake-anchored, one real-anchored), i.e. 6 faces per couple
        couples = self.num_samples // 6
        self.num_triplet_couples = couples
        self.num_triplets = 2 * couples
        self.num_samples = 3 * self.num_triplets

    def __len__(self):
        return self.num_triplets

    def __iter__(self):
        random_fake_idxs, random_real_idxs = get_iterative_real_fake_idxs(
            df_real=self.df_real,
            df_fake=self.df_fake,
            num_samples=self.num_samples,
            seed0=self.seed0
        )

        def next_face(pool):
            # Only the face is needed, the label is implied by the pool
            return self._get_face(pool.pop())[0]

        # Every round consumes three keys of each pool
        while len(random_fake_idxs) >= 3 and len(random_real_idxs) >= 3:
            # Fake anchor and positive, real negative
            anchor = next_face(random_fake_idxs)
            positive = next_face(random_fake_idxs)
            negative = next_face(random_real_idxs)
            yield anchor, positive, negative

            # Real anchor and positive, fake negative
            anchor = next_face(random_real_idxs)
            positive = next_face(random_real_idxs)
            negative = next_face(random_fake_idxs)
            yield anchor, positive, negative

models/icpr2020dfdc/isplutils/split.py ADDED Viewed

	@@ -0,0 +1,135 @@

+from typing import List, Dict, Tuple
+"""
+Video Face Manipulation Detection Through Ensemble of CNNs
+Image and Sound Processing Lab - Politecnico di Milano
+Nicolò Bonettini
+Edoardo Daniele Cannas
+Sara Mandelli
+Luca Bondi
+Paolo Bestagini
+"""
+import numpy as np
+import pandas as pd
# Dataset identifiers. How get_split_df interprets them:
#   'dfdc-35-5-10'               DFDC split by folder: 0-34 train, 35-39 val, 40-49 test
#   'ff-c23-720-140-140'         FF++ at quality 'c23', split on the shuffled youtube videos:
#                                first 720 train, next 140 val, the remaining ones test
#   'ff-c23-720-140-140-<N>fpv'  same split, keeping N randomly chosen rows per video
#   'celebdf'                    Celeb-DF, test split given by the 'test' column
available_datasets = [
    'dfdc-35-5-10',
    'ff-c23-720-140-140',
    'ff-c23-720-140-140-5fpv',
    'ff-c23-720-140-140-10fpv',
    'ff-c23-720-140-140-15fpv',
    'ff-c23-720-140-140-20fpv',
    'ff-c23-720-140-140-25fpv',
    'celebdf',  # just for convenience, not used in the original paper
]
def load_df(dfdc_df_path: str, ffpp_df_path: str, dfdc_faces_dir: str, ffpp_faces_dir: str, dataset: str) -> (pd.DataFrame, str):
    """
    Load the faces DataFrame of a dataset together with the root folder of its faces.

    :param dfdc_df_path: path to the pickled DFDC faces DataFrame
    :param ffpp_df_path: path to the pickled FF++ faces DataFrame
    :param dfdc_faces_dir: root folder of the DFDC faces
    :param ffpp_faces_dir: root folder of the FF++ faces
    :param dataset: dataset identifier; names starting with 'dfdc' or 'ff-' are supported
    :return: ``(DataFrame, root)``
    :raises NotImplementedError: for any other dataset name
    """
    # (name prefix, pickle path, faces root), checked in this order
    known_sources = (
        ('dfdc', dfdc_df_path, dfdc_faces_dir),
        ('ff-', ffpp_df_path, ffpp_faces_dir),
    )
    for prefix, df_path, faces_dir in known_sources:
        if dataset.startswith(prefix):
            return pd.read_pickle(df_path), faces_dir
    raise NotImplementedError('Unknown dataset: {}'.format(dataset))
def _select_by_originals(df: pd.DataFrame, originals) -> pd.DataFrame:
    # Rows derived from the given original videos first, then the original videos themselves
    return pd.concat((df[df['original'].isin(originals)], df[df['video'].isin(originals)]), axis=0)


def get_split_df(df: pd.DataFrame, dataset: str, split: str) -> pd.DataFrame:
    """
    Select the rows of a faces DataFrame belonging to a split of a dataset.

    The FF++ and Celeb-DF splits are drawn with a fixed seed (41). The global NumPy random state
    is saved beforehand and always restored, even when an exception is raised, so that the
    caller's random sequence is never altered.

    :param df: faces DataFrame; required columns depend on the dataset: 'folder' for DFDC;
               'video', 'original', 'source', 'quality' for FF++; 'video', 'original', 'label',
               'test' for Celeb-DF
    :param dataset: 'dfdc-35-5-10', a name starting with 'ff-c23-720-140-140' (optionally ending
                    with '-<N>fpv' to keep N random rows per video) or 'celebdf'
    :param split: 'train', 'val' or 'test'
    :return: the selected rows
    :raises NotImplementedError: on unknown dataset or split
    """
    if dataset == 'dfdc-35-5-10':
        folders_by_split = {'train': range(35), 'val': range(35, 40), 'test': range(40, 50)}
        if split not in folders_by_split:
            raise NotImplementedError('Unknown split: {}'.format(split))
        split_df = df[df['folder'].isin(folders_by_split[split])]

    elif dataset.startswith('ff-c23-720-140-140'):
        # Save random state
        st0 = np.random.get_state()
        try:
            # Set seed for this selection only
            np.random.seed(41)
            # Split on original videos
            crf = dataset.split('-')[1]
            random_youtube_videos = np.random.permutation(
                df[(df['source'] == 'youtube') & (df['quality'] == crf)]['video'].unique())
            originals_by_split = {
                'train': random_youtube_videos[:720],
                'val': random_youtube_videos[720:720 + 140],
                'test': random_youtube_videos[720 + 140:],
            }
            if split not in originals_by_split:
                raise NotImplementedError('Unknown split: {}'.format(split))
            split_df = _select_by_originals(df, originals_by_split[split])

            if dataset.endswith('fpv'):
                # Keep `fpv` distinct random rows for each video
                fpv = int(dataset.rsplit('-', 1)[1][:-3])
                idxs = []
                for video in split_df['video'].unique():
                    idxs.append(np.random.choice(split_df[split_df['video'] == video].index, fpv, replace=False))
                idxs = np.concatenate(idxs)
                split_df = split_df.loc[idxs]
        finally:
            # Restore random state
            np.random.set_state(st0)

    elif dataset == 'celebdf':
        seed = 41
        num_real_train = 600
        # Save random state
        st0 = np.random.get_state()
        try:
            # Set seed for this selection only
            np.random.seed(seed)
            # Split on original videos
            random_train_val_real_videos = np.random.permutation(
                df[(df['label'] == False) & (df['test'] == False)]['video'].unique())
            train_orig = random_train_val_real_videos[:num_real_train]
            val_orig = random_train_val_real_videos[num_real_train:]
            if split == 'train':
                split_df = _select_by_originals(df, train_orig)
            elif split == 'val':
                split_df = _select_by_originals(df, val_orig)
            elif split == 'test':
                split_df = df[df['test'] == True]
            else:
                raise NotImplementedError('Unknown split: {}'.format(split))
        finally:
            # Restore random state
            np.random.set_state(st0)

    else:
        raise NotImplementedError('Unknown dataset: {}'.format(dataset))
    return split_df
+def make_splits(dfdc_df: str, ffpp_df: str, dfdc_dir: str, ffpp_dir: str, dbs: Dict[str, List[str]]) -> Dict[str, Dict[str, Tuple[pd.DataFrame, str]]]:
+    """
+    Make split and return Dataframe and root
+    :param
+    dfdc_df: str, path to the DataFrame containing info on the faces extracted from the DFDC dataset with extract_faces.py
+    ffpp_df: str, path to the DataFrame containing info on the faces extracted from the FF++ dataset with extract_faces.py
+    dfdc_dir: str, path to the directory containing the faces extracted from the DFDC dataset with extract_faces.py
+    ffpp_dir: str, path to the directory containing the faces extracted from the FF++ dataset with extract_faces.py
+    dbs: {split_name:[split_dataset1,split_dataset2,...]}
+                Example:
+                {'train':['dfdc-35-5-15',],'val':['dfdc-35-5-15',]}
+    :return: split_dict: dictonary containing {split_name: ['train', 'val'], splitdb: List(pandas.DataFrame, str)}
+                Example:
+                {'train, 'dfdc-35-5-15': (dfdc_train_df, 'path/to/dir/of/DFDC/faces')}
+    """
+    split_dict = {}
+    full_dfs = {}
+    for split_name, split_dbs in dbs.items():
+        split_dict[split_name] = dict()
+        for split_db in split_dbs:
+            if split_db not in full_dfs:
+                full_dfs[split_db] = load_df(dfdc_df, ffpp_df, dfdc_dir, ffpp_dir, split_db)
+            full_df, root = full_dfs[split_db]
+            split_df = get_split_df(df=full_df, dataset=split_db, split=split_name)
+            split_dict[split_name][split_db] = (split_df, root)
+    return split_dict

models/icpr2020dfdc/isplutils/utils.py ADDED Viewed

	@@ -0,0 +1,247 @@

+"""
+Video Face Manipulation Detection Through Ensemble of CNNs
+Image and Sound Processing Lab - Politecnico di Milano
+Nicolò Bonettini
+Edoardo Daniele Cannas
+Sara Mandelli
+Luca Bondi
+Paolo Bestagini
+"""
+from pprint import pprint
+from typing import Iterable, List
+import albumentations as A
+import cv2
+import numpy as np
+import scipy
+import torch
+from PIL import Image
+from albumentations.pytorch import ToTensorV2
+from matplotlib import pyplot as plt
+from torch import nn as nn
+from torchvision import transforms
+def extract_meta_av(path: str) -> (int, int, int):
+    """
+    Extract video height, width and number of frames to index the files
+    :param path:
+    :return:
+    """
+    import av
+    try:
+        video = av.open(path)
+        video_stream = video.streams.video[0]
+        return video_stream.height, video_stream.width, video_stream.frames
+    except av.AVError as e:
+        print('Error while reading file: {}'.format(path))
+        print(e)
+        return 0, 0, 0
+    except IndexError as e:
+        print('Error while processing file: {}'.format(path))
+        print(e)
+        return 0, 0, 0
+def extract_meta_cv(path: str) -> (int, int, int):
+    """
+    Extract video height, width and number of frames to index the files
+    :param path:
+    :return:
+    """
+    try:
+        vid = cv2.VideoCapture(path)
+        num_frames = int(vid.get(cv2.CAP_PROP_FRAME_COUNT))
+        height = int(vid.get(cv2.CAP_PROP_FRAME_HEIGHT))
+        width = int(vid.get(cv2.CAP_PROP_FRAME_WIDTH))
+        return height, width, num_frames
+    except Exception as e:
+        print('Error while reading file: {}'.format(path))
+        print(e)
+        return 0, 0, 0
+def adapt_bb(frame_height: int, frame_width: int, bb_height: int, bb_width: int, left: int, top: int, right: int,
+             bottom: int) -> (
+        int, int, int, int):
+    x_ctr = (left + right) // 2
+    y_ctr = (bottom + top) // 2
+    new_top = max(y_ctr - bb_height // 2, 0)
+    new_bottom = min(new_top + bb_height, frame_height)
+    new_left = max(x_ctr - bb_width // 2, 0)
+    new_right = min(new_left + bb_width, frame_width)
+    return new_left, new_top, new_right, new_bottom
+def extract_bb(frame: Image.Image, bb: Iterable, scale: str, size: int) -> Image.Image:
+    """
+    Extract a face from a frame according to the given bounding box and scale policy
+    :param frame: Entire frame
+    :param bb: Bounding box (left,top,right,bottom) in the reference system of the frame
+    :param scale: "scale" to crop a square with size equal to the maximum between height and width of the face, then scale to size
+                  "crop" to crop a fixed square around face center,
+                  "tight" to crop face exactly at the bounding box with no scaling
+    :param size: size of the face
+    :return:
+    """
+    left, top, right, bottom = bb
+    if scale == "scale":
+        bb_width = int(right) - int(left)
+        bb_height = int(bottom) - int(top)
+        bb_to_desired_ratio = min(size / bb_height, size / bb_width) if (bb_width > 0 and bb_height > 0) else 1.
+        bb_width = int(size / bb_to_desired_ratio)
+        bb_height = int(size / bb_to_desired_ratio)
+        left, top, right, bottom = adapt_bb(frame.height, frame.width, bb_height, bb_width, left, top, right,
+                                            bottom)
+        face = frame.crop((left, top, right, bottom)).resize((size, size), Image.BILINEAR)
+    elif scale == "crop":
+        # Find the center of the bounding box and cut an area around it of height x width
+        left, top, right, bottom = adapt_bb(frame.height, frame.width, size, size, left, top, right,
+                                            bottom)
+        face = frame.crop((left, top, right, bottom))
+    elif scale == "tight":
+        left, top, right, bottom = adapt_bb(frame.height, frame.width, bottom - top, right - left, left, top, right,
+                                            bottom)
+        face = frame.crop((left, top, right, bottom))
+    else:
+        raise ValueError('Unknown scale value: {}'.format(scale))
+    return face
+def showimage(img_tensor: torch.Tensor):
+    topil = transforms.Compose([
+        transforms.Normalize(mean=[0, 0, 0, ], std=[1 / 0.229, 1 / 0.224, 1 / 0.225]),
+        transforms.Normalize(mean=[-0.485, -0.456, -0.406], std=[1, 1, 1]),
+        transforms.ToPILImage()
+    ])
+    plt.figure()
+    plt.imshow(topil(img_tensor))
+    plt.show()
+def make_train_tag(net_class: nn.Module,
+                   face_policy: str,
+                   patch_size: int,
+                   traindb: List[str],
+                   seed: int,
+                   suffix: str,
+                   debug: bool,
+                   ):
+    # Training parameters and tag
+    tag_params = dict(net=net_class.__name__,
+                      traindb='-'.join(traindb),
+                      face=face_policy,
+                      size=patch_size,
+                      seed=seed
+                      )
+    print('Parameters')
+    pprint(tag_params)
+    tag = 'debug_' if debug else ''
+    tag += '_'.join(['-'.join([key, str(tag_params[key])]) for key in tag_params])
+    if suffix is not None:
+        tag += '_' + suffix
+    print('Tag: {:s}'.format(tag))
+    return tag
+def get_transformer(face_policy: str, patch_size: int, net_normalizer: transforms.Normalize, train: bool):
+    # Transformers and traindb
+    if face_policy == 'scale':
+        # The loader crops the face isotropically then scales to a square of size patch_size_load
+        loading_transformations = [
+            A.PadIfNeeded(min_height=patch_size, min_width=patch_size,
+                          border_mode=cv2.BORDER_CONSTANT, value=0,always_apply=True),
+            A.Resize(height=patch_size,width=patch_size,always_apply=True),
+        ]
+        if train:
+            downsample_train_transformations = [
+                A.Downscale(scale_max=0.5, scale_min=0.5, p=0.5),  # replaces scaled dataset
+            ]
+        else:
+            downsample_train_transformations = []
+    elif face_policy == 'tight':
+        # The loader crops the face tightly without any scaling
+        loading_transformations = [
+            A.LongestMaxSize(max_size=patch_size, always_apply=True),
+            A.PadIfNeeded(min_height=patch_size, min_width=patch_size,
+                          border_mode=cv2.BORDER_CONSTANT, value=0,always_apply=True),
+        ]
+        if train:
+            downsample_train_transformations = [
+                A.Downscale(scale_max=0.5, scale_min=0.5, p=0.5),  # replaces scaled dataset
+            ]
+        else:
+            downsample_train_transformations = []
+    else:
+        raise ValueError('Unknown value for face_policy: {}'.format(face_policy))
+    if train:
+        aug_transformations = [
+            A.Compose([
+                A.HorizontalFlip(),
+                A.OneOf([
+                    A.RandomBrightnessContrast(),
+                    A.HueSaturationValue(hue_shift_limit=10, sat_shift_limit=30, val_shift_limit=20),
+                ]),
+                A.OneOf([
+                    A.ISONoise(),
+                    A.IAAAdditiveGaussianNoise(scale=(0.01 * 255, 0.03 * 255)),
+                ]),
+                A.Downscale(scale_min=0.7, scale_max=0.9, interpolation=cv2.INTER_LINEAR),
+                A.ImageCompression(quality_lower=50, quality_upper=99),
+            ], )
+        ]
+    else:
+        aug_transformations = []
+    # Common final transformations
+    final_transformations = [
+        A.Normalize(mean=net_normalizer.mean, std=net_normalizer.std, ),
+        ToTensorV2(),
+    ]
+    transf = A.Compose(
+        loading_transformations + downsample_train_transformations + aug_transformations + final_transformations)
+    return transf
+def aggregate(x, deadzone: float, pre_mult: float, policy: str, post_mult: float, clipmargin: float, params={}):
+    x = x.copy()
+    if deadzone > 0:
+        x = x[(x > deadzone) | (x < -deadzone)]
+        if len(x) == 0:
+            x = np.asarray([0, ])
+    if policy == 'mean':
+        x = np.mean(x)
+        x = scipy.special.expit(x * pre_mult)
+        x = (x - 0.5) * post_mult + 0.5
+    elif policy == 'sigmean':
+        x = scipy.special.expit(x * pre_mult).mean()
+        x = (x - 0.5) * post_mult + 0.5
+    elif policy == 'meanp':
+        pow_coeff = params.pop('p', 3)
+        x = np.mean(np.sign(x) * (np.abs(x) ** pow_coeff))
+        x = np.sign(x) * (np.abs(x) ** (1 / pow_coeff))
+        x = scipy.special.expit(x * pre_mult)
+        x = (x - 0.5) * post_mult + 0.5
+    elif policy == 'median':
+        x = scipy.special.expit(np.median(x) * pre_mult)
+        x = (x - 0.5) * post_mult + 0.5
+    elif policy == 'sigmedian':
+        x = np.median(scipy.special.expit(x * pre_mult))
+        x = (x - 0.5) * post_mult + 0.5
+    elif policy == 'maxabs':
+        x = np.min(x) if abs(np.min(x)) > abs(np.max(x)) else np.max(x)
+        x = scipy.special.expit(x * pre_mult)
+        x = (x - 0.5) * post_mult + 0.5
+    elif policy == 'avgvoting':
+        x = np.mean(np.sign(x))
+        x = (x * post_mult + 1) / 2
+    elif policy == 'voting':
+        x = np.sign(np.mean(x * pre_mult))
+        x = (x - 0.5) * post_mult + 0.5
+    else:
+        raise NotImplementedError()
+    return np.clip(x, clipmargin, 1 - clipmargin)

models/icpr2020dfdc/test_model.py ADDED Viewed

	@@ -0,0 +1,270 @@

+"""
+Video Face Manipulation Detection Through Ensemble of CNNs
+Image and Sound Processing Lab - Politecnico di Milano
+Nicolò Bonettini
+Edoardo Daniele Cannas
+Sara Mandelli
+Luca Bondi
+Paolo Bestagini
+"""
+import argparse
+import gc
+from collections import OrderedDict
+from pathlib import Path
+import albumentations as A
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import torch
+import torch.nn as nn
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+from architectures import fornet
+from architectures.fornet import FeatureExtractor
+from isplutils import utils, split
+from isplutils.data import FrameFaceDatasetTest
+def main():
+    """
+    Command line entry point: run a trained network on the requested splits of the requested datasets and
+    save, for each (dataset, split), a pickled DataFrame with per-frame 'score' and 'loss' columns.
+    Face policy, patch size and network class are parsed from the 'face-', 'size-' and 'net-' fields of the
+    model path.
+    """
+    # Args
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--testsets', type=str, help='Testing datasets', nargs='+', choices=split.available_datasets,
+                        required=True)
+    parser.add_argument('--testsplits', type=str, help='Test split', nargs='+', default=['val', 'test'],
+                        choices=['train', 'val', 'test'])
+    parser.add_argument('--dfdc_faces_df_path', type=str, action='store',
+                        help='Path to the Pandas Dataframe obtained from extract_faces.py on the DFDC dataset. '
+                             'Required for training/validating on the DFDC dataset.')
+    parser.add_argument('--dfdc_faces_dir', type=str, action='store',
+                        help='Path to the directory containing the faces extracted from the DFDC dataset. '
+                             'Required for training/validating on the DFDC dataset.')
+    parser.add_argument('--ffpp_faces_df_path', type=str, action='store',
+                        help='Path to the Pandas Dataframe obtained from extract_faces.py on the FF++ dataset. '
+                             'Required for training/validating on the FF++ dataset.')
+    parser.add_argument('--ffpp_faces_dir', type=str, action='store',
+                        help='Path to the directory containing the faces extracted from the FF++ dataset. '
+                             'Required for training/validating on the FF++ dataset.')
+    # Specify trained model path
+    parser.add_argument('--model_path', type=Path, help='Full path of the trained model', required=True)
+    # Common params
+    parser.add_argument('--batch', type=int, help='Batch size to fit in GPU memory', default=128)
+    parser.add_argument('--workers', type=int, help='Num workers for data loaders', default=6)
+    parser.add_argument('--device', type=int, help='GPU id', default=0)
+    parser.add_argument('--debug', action='store_true', help='Debug flag', )
+    parser.add_argument('--num_video', type=int, help='Number of real-fake videos to test')
+    parser.add_argument('--results_dir', type=Path, help='Output folder',
+                        default='results/')
+    parser.add_argument('--override', action='store_true', help='Override existing results', )
+    args = parser.parse_args()
+    # Falls back to CPU when CUDA is not available, whatever --device says
+    device = torch.device('cuda:{}'.format(args.device)) if torch.cuda.is_available() else torch.device('cpu')
+    num_workers: int = args.workers
+    batch_size: int = args.batch
+    max_num_videos_per_label: int = args.num_video  # number of real-fake videos to test
+    model_path: Path = args.model_path
+    results_dir: Path = args.results_dir
+    debug: bool = args.debug
+    override: bool = args.override
+    test_sets = args.testsets
+    test_splits = args.testsplits
+    dfdc_df_path = args.dfdc_faces_df_path
+    ffpp_df_path = args.ffpp_faces_df_path
+    dfdc_faces_dir = args.dfdc_faces_dir
+    ffpp_faces_dir = args.ffpp_faces_dir
+    # get arguments from the model path
+    # (an IndexError is raised here if the path lacks a 'face-', 'size-' or 'net-' field)
+    face_policy = str(model_path).split('face-')[1].split('_')[0]
+    patch_size = int(str(model_path).split('size-')[1].split('_')[0])
+    net_name = str(model_path).split('net-')[1].split('_')[0]
+    # <parent folder>_<file name without extension>, used as the name of the results folder
+    model_name = '_'.join(model_path.with_suffix('').parts[-2:])
+    # Load net
+    net_class = getattr(fornet, net_name)
+    # load model
+    print('Loading model...')
+    # NOTE(review): torch.load unpickles the file - only load checkpoints from trusted sources
+    state_tmp = torch.load(model_path, map_location='cpu')
+    if 'net' not in state_tmp.keys():
+        # No 'net' entry: treat the file as a bare state dict and prefix every key with 'model.'
+        state = OrderedDict({'net': OrderedDict()})
+        [state['net'].update({'model.{}'.format(k): v}) for k, v in state_tmp.items()]
+    else:
+        state = state_tmp
+    net: FeatureExtractor = net_class().eval().to(device)
+    incomp_keys = net.load_state_dict(state['net'], strict=True)
+    print(incomp_keys)
+    print('Model loaded!')
+    # val loss per-frame
+    criterion = nn.BCEWithLogitsLoss(reduction='none')
+    # Define data transformers
+    test_transformer = utils.get_transformer(face_policy, patch_size, net.get_normalizer(), train=False)
+    # datasets and dataloaders (from train_binclass.py)
+    print('Loading data...')
+    # Check if paths for DFDC and FF++ extracted faces and DataFrames are provided
+    for dataset in test_sets:
+        if dataset.split('-')[0] == 'dfdc' and (dfdc_df_path is None or dfdc_faces_dir is None):
+            raise RuntimeError('Specify DataFrame and directory for DFDC faces for testing!')
+        elif dataset.split('-')[0] == 'ff' and (ffpp_df_path is None or ffpp_faces_dir is None):
+            raise RuntimeError('Specify DataFrame and directory for FF++ faces for testing!')
+    # All three splits are always built here; --testsplits only selects which ones are processed below
+    splits = split.make_splits(dfdc_df=dfdc_df_path, ffpp_df=ffpp_df_path, dfdc_dir=dfdc_faces_dir,
+                               ffpp_dir=ffpp_faces_dir, dbs={'train': test_sets, 'val': test_sets, 'test': test_sets})
+    train_dfs = [splits['train'][db][0] for db in splits['train']]
+    train_roots = [splits['train'][db][1] for db in splits['train']]
+    val_roots = [splits['val'][db][1] for db in splits['val']]
+    val_dfs = [splits['val'][db][0] for db in splits['val']]
+    test_dfs = [splits['test'][db][0] for db in splits['test']]
+    test_roots = [splits['test'][db][1] for db in splits['test']]
+    # Output paths
+    out_folder = results_dir.joinpath(model_name)
+    out_folder.mkdir(mode=0o775, parents=True, exist_ok=True)
+    # Samples selection
+    if max_num_videos_per_label and max_num_videos_per_label > 0:
+        dfs_out_train = [select_videos(df, max_num_videos_per_label) for df in train_dfs]
+        dfs_out_val = [select_videos(df, max_num_videos_per_label) for df in val_dfs]
+        dfs_out_test = [select_videos(df, max_num_videos_per_label) for df in test_dfs]
+    else:
+        dfs_out_train = train_dfs
+        dfs_out_val = val_dfs
+        dfs_out_test = test_dfs
+    # Extractions list: tuples of (DataFrame, output pickle path, faces root, tag printed in the logs)
+    extr_list = []
+    # Append train and validation set first
+    if 'train' in test_splits:
+        for idx, dataset in enumerate(test_sets):
+            extr_list.append(
+                (dfs_out_train[idx], out_folder.joinpath(dataset + '_train.pkl'), train_roots[idx], dataset + ' TRAIN')
+            )
+    if 'val' in test_splits:
+        for idx, dataset in enumerate(test_sets):
+            extr_list.append(
+                (dfs_out_val[idx], out_folder.joinpath(dataset + '_val.pkl'), val_roots[idx], dataset + ' VAL')
+            )
+    if 'test' in test_splits:
+        for idx, dataset in enumerate(test_sets):
+            extr_list.append(
+                (dfs_out_test[idx], out_folder.joinpath(dataset + '_test.pkl'), test_roots[idx], dataset + ' TEST')
+            )
+    for df, df_path, df_root, tag in extr_list:
+        # Existing result files are skipped unless --override is given
+        if override or not df_path.exists():
+            print('\n##### PREDICT VIDEOS FROM {} #####'.format(tag))
+            print('Real frames: {}'.format(sum(df['label'] == False)))
+            print('Fake frames: {}'.format(sum(df['label'] == True)))
+            print('Real videos: {}'.format(df[df['label'] == False]['video'].nunique()))
+            print('Fake videos: {}'.format(df[df['label'] == True]['video'].nunique()))
+            dataset_out = process_dataset(root=df_root, df=df, net=net, criterion=criterion,
+                                          patch_size=patch_size,
+                                          face_policy=face_policy, transformer=test_transformer,
+                                          batch_size=batch_size,
+                                          num_workers=num_workers, device=device, )
+            df['score'] = dataset_out['score'].astype(np.float32)
+            df['loss'] = dataset_out['loss'].astype(np.float32)
+            print('Saving results to: {}'.format(df_path))
+            df.to_pickle(str(df_path))
+            if debug:
+                # One histogram figure per processed set; all of them are shown together at the end
+                plt.figure()
+                plt.title(tag)
+                plt.hist(df[df.label == True].score, bins=100, alpha=0.6, label='FAKE frames')
+                plt.hist(df[df.label == False].score, bins=100, alpha=0.6, label='REAL frames')
+                plt.legend()
+            # Drop the references and collect before moving on to the next set
+            del (dataset_out)
+            del (df)
+            gc.collect()
+    if debug:
+        plt.show()
+    print('Completed!')
+def process_dataset(df: pd.DataFrame,
+                    root: str,
+                    net: FeatureExtractor,
+                    criterion,
+                    patch_size: int,
+                    face_policy: str,
+                    transformer: A.BasicTransform,
+                    batch_size: int,
+                    num_workers: int,
+                    device: torch.device,
+                    ) -> dict:
+    if isinstance(device, (int, str)):
+        device = torch.device(device)
+    dataset = FrameFaceDatasetTest(
+        root=root,
+        df=df,
+        size=patch_size,
+        scale=face_policy,
+        transformer=transformer,
+    )
+    # Preallocate
+    score = np.zeros(len(df))
+    loss = np.zeros(len(df))
+    loader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False, drop_last=False)
+    with torch.no_grad():
+        idx0 = 0
+        for batch_data in tqdm(loader):
+            batch_images = batch_data[0].to(device)
+            batch_labels = batch_data[1].to(device)
+            batch_samples = len(batch_images)
+            batch_out = net(batch_images)
+            batch_loss = criterion(batch_out, batch_labels)
+            score[idx0:idx0 + batch_samples] = batch_out.cpu().numpy()[:, 0]
+            loss[idx0:idx0 + batch_samples] = batch_loss.cpu().numpy()[:, 0]
+            idx0 += batch_samples
+    out_dict = {'score': score, 'loss': loss}
+    return out_dict
+def select_videos(df: pd.DataFrame, max_videos_per_label: int) -> pd.DataFrame:
+    """
+    Select up to a maximum number of videos
+    :param df: DataFrame of frames. Required columns: 'video','label'
+    :param max_videos_per_label: maximum number of real and fake videos
+    :return: DataFrame of selected frames
+    """
+    # Save random state
+    st0 = np.random.get_state()
+    # Set seed for this selection only
+    np.random.seed(42)
+    df_fake = df[df.label == True]
+    fake_videos = df_fake['video'].unique()
+    selected_fake_videos = np.random.choice(fake_videos, min(max_videos_per_label, len(fake_videos)), replace=False)
+    df_selected_fake_frames = df_fake[df_fake['video'].isin(selected_fake_videos)]
+    df_real = df[df.label == False]
+    real_videos = df_real['video'].unique()
+    selected_real_videos = np.random.choice(real_videos, min(max_videos_per_label, len(real_videos)), replace=False)
+    df_selected_real_frames = df_real[df_real['video'].isin(selected_real_videos)]
+    # Restore random state
+    np.random.set_state(st0)
+    return pd.concat((df_selected_fake_frames, df_selected_real_frames), axis=0, verify_integrity=True).copy()
+# Run the evaluation only when executed as a script, not when imported
+if __name__ == '__main__':
+    main()

models/icpr2020dfdc/train_binclass.py ADDED Viewed

	@@ -0,0 +1,460 @@

+"""
+Video Face Manipulation Detection Through Ensemble of CNNs
+Image and Sound Processing Lab - Politecnico di Milano
+Nicolò Bonettini
+Edoardo Daniele Cannas
+Sara Mandelli
+Luca Bondi
+Paolo Bestagini
+"""
+import argparse
+import os
+import shutil
+import warnings
+import albumentations as A
+import numpy as np
+import pandas as pd
+import torch
+import torch.multiprocessing
+from torchvision.transforms import ToPILImage, ToTensor
+from isplutils import utils, split
+torch.multiprocessing.set_sharing_strategy('file_system')
+import torch.nn as nn
+from albumentations.pytorch import ToTensorV2
+from sklearn.metrics import roc_auc_score
+from tensorboardX import SummaryWriter
+from torch import optim
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+from PIL import ImageChops, Image
+from architectures import fornet
+from isplutils.data import FrameFaceIterableDataset, load_face
+def main():
+    # Args
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--net', type=str, help='Net model class', required=True)
+    parser.add_argument('--traindb', type=str, help='Training datasets', nargs='+', choices=split.available_datasets,
+                        required=True)
+    parser.add_argument('--valdb', type=str, help='Validation datasets', nargs='+', choices=split.available_datasets,
+                        required=True)
+    parser.add_argument('--dfdc_faces_df_path', type=str, action='store',
+                        help='Path to the Pandas Dataframe obtained from extract_faces.py on the DFDC dataset. '
+                             'Required for training/validating on the DFDC dataset.')
+    parser.add_argument('--dfdc_faces_dir', type=str, action='store',
+                        help='Path to the directory containing the faces extracted from the DFDC dataset. '
+                             'Required for training/validating on the DFDC dataset.')
+    parser.add_argument('--ffpp_faces_df_path', type=str, action='store',
+                        help='Path to the Pandas Dataframe obtained from extract_faces.py on the FF++ dataset. '
+                             'Required for training/validating on the FF++ dataset.')
+    parser.add_argument('--ffpp_faces_dir', type=str, action='store',
+                        help='Path to the directory containing the faces extracted from the FF++ dataset. '
+                             'Required for training/validating on the FF++ dataset.')
+    parser.add_argument('--face', type=str, help='Face crop or scale', required=True,
+                        choices=['scale', 'tight'])
+    parser.add_argument('--size', type=int, help='Train patch size', required=True)
+    parser.add_argument('--batch', type=int, help='Batch size to fit in GPU memory', default=32)
+    parser.add_argument('--lr', type=float, default=1e-5, help='Learning rate')
+    parser.add_argument('--valint', type=int, help='Validation interval (iterations)', default=500)
+    parser.add_argument('--patience', type=int, help='Patience before dropping the LR [validation intervals]',
+                        default=10)
+    parser.add_argument('--maxiter', type=int, help='Maximum number of iterations', default=20000)
+    parser.add_argument('--init', type=str, help='Weight initialization file')
+    parser.add_argument('--scratch', action='store_true', help='Train from scratch')
+    parser.add_argument('--trainsamples', type=int, help='Limit the number of train samples per epoch', default=-1)
+    parser.add_argument('--valsamples', type=int, help='Limit the number of validation samples per epoch',
+                        default=6000)
+    parser.add_argument('--logint', type=int, help='Training log interval (iterations)', default=100)
+    parser.add_argument('--workers', type=int, help='Num workers for data loaders', default=6)
+    parser.add_argument('--device', type=int, help='GPU device id', default=0)
+    parser.add_argument('--seed', type=int, help='Random seed', default=0)
+    parser.add_argument('--debug', action='store_true', help='Activate debug')
+    parser.add_argument('--suffix', type=str, help='Suffix to default tag')
+    parser.add_argument('--attention', action='store_true',
+                        help='Enable Tensorboard log of attention masks')
+    parser.add_argument('--log_dir', type=str, help='Directory for saving the training logs',
+                        default='runs/binclass/')
+    parser.add_argument('--models_dir', type=str, help='Directory for saving the models weights',
+                        default='weights/binclass/')
+    args = parser.parse_args()
+    # Parse arguments
+    net_class = getattr(fornet, args.net)
+    train_datasets = args.traindb
+    val_datasets = args.valdb
+    dfdc_df_path = args.dfdc_faces_df_path
+    ffpp_df_path = args.ffpp_faces_df_path
+    dfdc_faces_dir = args.dfdc_faces_dir
+    ffpp_faces_dir = args.ffpp_faces_dir
+    face_policy = args.face
+    face_size = args.size
+    batch_size = args.batch
+    initial_lr = args.lr
+    validation_interval = args.valint
+    patience = args.patience
+    max_num_iterations = args.maxiter
+    initial_model = args.init
+    train_from_scratch = args.scratch
+    max_train_samples = args.trainsamples
+    max_val_samples = args.valsamples
+    log_interval = args.logint
+    num_workers = args.workers
+    device = torch.device('cuda:{:d}'.format(args.device)) if torch.cuda.is_available() else torch.device('cpu')
+    seed = args.seed
+    debug = args.debug
+    suffix = args.suffix
+    enable_attention = args.attention
+    weights_folder = args.models_dir
+    logs_folder = args.log_dir
+    # Random initialization
+    np.random.seed(seed)
+    torch.random.manual_seed(seed)
+    # Load net
+    net: nn.Module = net_class().to(device)
+    # Loss and optimizers
+    criterion = nn.BCEWithLogitsLoss()
+    min_lr = initial_lr * 1e-5
+    optimizer = optim.Adam(net.get_trainable_parameters(), lr=initial_lr)
+    lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(
+        optimizer=optimizer,
+        mode='min',
+        factor=0.1,
+        patience=patience,
+        cooldown=2 * patience,
+        min_lr=min_lr,
+    )
+    tag = utils.make_train_tag(net_class=net_class,
+                               traindb=train_datasets,
+                               face_policy=face_policy,
+                               patch_size=face_size,
+                               seed=seed,
+                               suffix=suffix,
+                               debug=debug,
+                               )
+    # Model checkpoint paths
+    bestval_path = os.path.join(weights_folder, tag, 'bestval.pth')
+    last_path = os.path.join(weights_folder, tag, 'last.pth')
+    periodic_path = os.path.join(weights_folder, tag, 'it{:06d}.pth')
+    os.makedirs(os.path.join(weights_folder, tag), exist_ok=True)
+    # Load model
+    val_loss = min_val_loss = 10
+    epoch = iteration = 0
+    net_state = None
+    opt_state = None
+    if initial_model is not None:
+        # If given load initial model
+        print('Loading model form: {}'.format(initial_model))
+        state = torch.load(initial_model, map_location='cpu')
+        net_state = state['net']
+    elif not train_from_scratch and os.path.exists(last_path):
+        print('Loading model form: {}'.format(last_path))
+        state = torch.load(last_path, map_location='cpu')
+        net_state = state['net']
+        opt_state = state['opt']
+        iteration = state['iteration'] + 1
+        epoch = state['epoch']
+    if not train_from_scratch and os.path.exists(bestval_path):
+        state = torch.load(bestval_path, map_location='cpu')
+        min_val_loss = state['val_loss']
+    if net_state is not None:
+        incomp_keys = net.load_state_dict(net_state, strict=False)
+        print(incomp_keys)
+    if opt_state is not None:
+        for param_group in opt_state['param_groups']:
+            param_group['lr'] = initial_lr
+        optimizer.load_state_dict(opt_state)
+    # Initialize Tensorboard
+    logdir = os.path.join(logs_folder, tag)
+    if iteration == 0:
+        # If training from scratch or initialization remove history if exists
+        shutil.rmtree(logdir, ignore_errors=True)
+    # TensorboardX instance
+    tb = SummaryWriter(logdir=logdir)
+    if iteration == 0:
+        dummy = torch.randn((1, 3, face_size, face_size), device=device)
+        dummy = dummy.to(device)
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore")
+            tb.add_graph(net, [dummy, ], verbose=False)
+    transformer = utils.get_transformer(face_policy=face_policy, patch_size=face_size,
+                                        net_normalizer=net.get_normalizer(), train=True)
+    # Datasets and data loaders
+    print('Loading data')
+    # Check if paths for DFDC and FF++ extracted faces and DataFrames are provided
+    for dataset in train_datasets:
+        if dataset.split('-')[0] == 'dfdc' and (dfdc_df_path is None or dfdc_faces_dir is None):
+            raise RuntimeError('Specify DataFrame and directory for DFDC faces for training!')
+        elif dataset.split('-')[0] == 'ff' and (ffpp_df_path is None or ffpp_faces_dir is None):
+            raise RuntimeError('Specify DataFrame and directory for FF++ faces for training!')
+    for dataset in val_datasets:
+        if dataset.split('-')[0] == 'dfdc' and (dfdc_df_path is None or dfdc_faces_dir is None):
+            raise RuntimeError('Specify DataFrame and directory for DFDC faces for validation!')
+        elif dataset.split('-')[0] == 'ff' and (ffpp_df_path is None or ffpp_faces_dir is None):
+            raise RuntimeError('Specify DataFrame and directory for FF++ faces for validation!')
+    # Load splits with the make_splits function
+    splits = split.make_splits(dfdc_df=dfdc_df_path, ffpp_df=ffpp_df_path, dfdc_dir=dfdc_faces_dir, ffpp_dir=ffpp_faces_dir,
+                               dbs={'train': train_datasets, 'val': val_datasets})
+    train_dfs = [splits['train'][db][0] for db in splits['train']]
+    train_roots = [splits['train'][db][1] for db in splits['train']]
+    val_roots = [splits['val'][db][1] for db in splits['val']]
+    val_dfs = [splits['val'][db][0] for db in splits['val']]
+    train_dataset = FrameFaceIterableDataset(roots=train_roots,
+                                             dfs=train_dfs,
+                                             scale=face_policy,
+                                             num_samples=max_train_samples,
+                                             transformer=transformer,
+                                             size=face_size,
+                                             )
+    val_dataset = FrameFaceIterableDataset(roots=val_roots,
+                                           dfs=val_dfs,
+                                           scale=face_policy,
+                                           num_samples=max_val_samples,
+                                           transformer=transformer,
+                                           size=face_size,
+                                           )
+    train_loader = DataLoader(train_dataset, num_workers=num_workers, batch_size=batch_size, )
+    val_loader = DataLoader(val_dataset, num_workers=num_workers, batch_size=batch_size, )
+    print('Training samples: {}'.format(len(train_dataset)))
+    print('Validation samples: {}'.format(len(val_dataset)))
+    if len(train_dataset) == 0:
+        print('No training samples. Halt.')
+        return
+    if len(val_dataset) == 0:
+        print('No validation samples. Halt.')
+        return
+    stop = False
+    while not stop:
+        # Training
+        optimizer.zero_grad()
+        train_loss = train_num = 0
+        train_pred_list = []
+        train_labels_list = []
+        for train_batch in tqdm(train_loader, desc='Epoch {:03d}'.format(epoch), leave=False,
+                                total=len(train_loader) // train_loader.batch_size):
+            net.train()
+            batch_data, batch_labels = train_batch
+            train_batch_num = len(batch_labels)
+            train_num += train_batch_num
+            train_labels_list.append(batch_labels.numpy().flatten())
+            train_batch_loss, train_batch_pred = batch_forward(net, device, criterion, batch_data, batch_labels)
+            train_pred_list.append(train_batch_pred.flatten())
+            if torch.isnan(train_batch_loss):
+                raise ValueError('NaN loss')
+            train_loss += train_batch_loss.item() * train_batch_num
+            # Optimization
+            train_batch_loss.backward()
+            optimizer.step()
+            optimizer.zero_grad()
+            # Logging
+            if iteration > 0 and (iteration % log_interval == 0):
+                train_loss /= train_num
+                tb.add_scalar('train/loss', train_loss, iteration)
+                tb.add_scalar('lr', optimizer.param_groups[0]['lr'], iteration)
+                tb.add_scalar('epoch', epoch, iteration)
+                # Checkpoint
+                save_model(net, optimizer, train_loss, val_loss, iteration, batch_size, epoch, last_path)
+                train_loss = train_num = 0
+            # Validation
+            if iteration > 0 and (iteration % validation_interval == 0):
+                # Model checkpoint
+                save_model(net, optimizer, train_loss, val_loss, iteration, batch_size, epoch,
+                           periodic_path.format(iteration))
+                # Train cumulative stats
+                train_labels = np.concatenate(train_labels_list)
+                train_pred = np.concatenate(train_pred_list)
+                train_labels_list = []
+                train_pred_list = []
+                train_roc_auc = roc_auc_score(train_labels, train_pred)
+                tb.add_scalar('train/roc_auc', train_roc_auc, iteration)
+                tb.add_pr_curve('train/pr', train_labels, train_pred, iteration)
+                # Validation
+                val_loss = validation_routine(net, device, val_loader, criterion, tb, iteration, 'val')
+                tb.flush()
+                # LR Scheduler
+                lr_scheduler.step(val_loss)
+                # Model checkpoint
+                if val_loss < min_val_loss:
+                    min_val_loss = val_loss
+                    save_model(net, optimizer, train_loss, val_loss, iteration, batch_size, epoch, bestval_path)
+                # Attention
+                if enable_attention and hasattr(net, 'get_attention'):
+                    net.eval()
+                    # For each dataframe show the attention for a real,fake couple of frames
+                    for df, root, sample_idx, tag in [
+                        (train_dfs[0], train_roots[0], train_dfs[0][train_dfs[0]['label'] == False].index[0],
+                         'train/att/real'),
+                        (train_dfs[0], train_roots[0], train_dfs[0][train_dfs[0]['label'] == True].index[0],
+                         'train/att/fake'),
+                    ]:
+                        record = df.loc[sample_idx]
+                        tb_attention(tb, tag, iteration, net, device, face_size, face_policy,
+                                     transformer, root, record)
+                if optimizer.param_groups[0]['lr'] == min_lr:
+                    print('Reached minimum learning rate. Stopping.')
+                    stop = True
+                    break
+            iteration += 1
+            if iteration > max_num_iterations:
+                print('Maximum number of iterations reached')
+                stop = True
+                break
+            # End of iteration
+        epoch += 1
+    # Needed to flush out last events
+    tb.close()
+    print('Completed')
def tb_attention(tb: SummaryWriter,
                 tag: str,
                 iteration: int,
                 net: nn.Module,
                 device: torch.device,
                 patch_size_load: int,
                 face_crop_scale: str,
                 val_transformer: A.BasicTransform,
                 root: str,
                 record: pd.Series,
                 ):
    """
    Log to TensorBoard a face multiplied by the attention map the net produces for it.

    The face described by `record` is loaded twice with `load_face`: once through
    `val_transformer` (this version is fed to the network) and once through a bare
    `ToTensorV2()` (this version is the picture the attention map is multiplied with).

    :param tb: writer that receives the resulting image
    :param tag: TensorBoard tag of the image
    :param iteration: global step attached to the image
    :param net: model exposing `get_attention`; visible callers switch it to eval mode beforehand
    :param device: device the input is moved to before the forward pass
    :param patch_size_load: forwarded to `load_face` as `size`
    :param face_crop_scale: forwarded to `load_face` as `scale`
    :param val_transformer: transformer applied to the network input
    :param root: forwarded to `load_face` as `root`
    :param record: forwarded to `load_face` as `record`
    """
    # Network input and untouched copy of the same face
    sample_t = load_face(record=record, root=root, size=patch_size_load, scale=face_crop_scale,
                         transformer=val_transformer)
    sample_t_clean = load_face(record=record, root=root, size=patch_size_load, scale=face_crop_scale,
                               transformer=ToTensorV2())
    # `.to(device)` follows the requested device. The previous
    # `if torch.cuda.is_available(): sample_t.cuda(device)` raised when a CPU device
    # was requested on a machine that happens to have CUDA.
    sample_t = sample_t.to(device)
    # Feed to net: first element of the batch of attention maps, back on the CPU
    with torch.no_grad():
        att: torch.Tensor = net.get_attention(sample_t.unsqueeze(0))[0].cpu()
    att_img: Image.Image = ToPILImage()(att)
    sample_img = ToPILImage()(sample_t_clean)
    # Bring the attention map to the face resolution before the pixel-wise product
    att_img = att_img.resize(sample_img.size, resample=Image.NEAREST).convert('RGB')
    sample_att_img = ImageChops.multiply(sample_img, att_img)
    sample_att = ToTensor()(sample_att_img)
    tb.add_image(tag=tag, img_tensor=sample_att, global_step=iteration)
def batch_forward(net: nn.Module, device: torch.device, criterion, data: torch.Tensor, labels: torch.Tensor) -> (
        torch.Tensor, np.ndarray):
    """
    Run one forward pass and compute the loss for a batch.

    :param net: model to evaluate
    :param device: device `data` and `labels` are moved to
    :param criterion: callable invoked as `criterion(out, labels)` on the raw net output
    :param data: batch of inputs
    :param labels: batch of targets
    :return: `(loss, pred)` where `loss` is whatever `criterion` returns (still attached to the
             graph, so the caller can call `backward()` on it) and `pred` is the sigmoid of the
             net output as a detached NumPy array on the CPU
    """
    data = data.to(device)
    labels = labels.to(device)
    out = net(data)
    # Scores are only used for metrics: detach them so they never hold the graph
    pred = torch.sigmoid(out).detach().cpu().numpy()
    loss = criterion(out, labels)
    return loss, pred
def validation_routine(net, device, val_loader, criterion, tb, iteration, tag: str, loader_len_norm: int = None):
    """
    Evaluate the net on a whole loader and log the results to TensorBoard.

    Logs `<tag>/loss` (per-sample average of the batch losses) and, when the criterion is a
    `nn.BCEWithLogitsLoss`, also `<tag>/roc_auc` and the `<tag>/pr` curve.

    :param net: model, switched to eval mode here (it is not switched back)
    :param device: device forwarded to `batch_forward`
    :param val_loader: loader yielding `(data, labels)` batches
    :param criterion: loss forwarded to `batch_forward`
    :param tb: TensorBoard writer
    :param iteration: global step used for every logged value
    :param tag: prefix of the logged tags
    :param loader_len_norm: divisor applied to `len(val_loader)` for the progress bar total,
                            defaults to the loader batch size
    :return: the average validation loss
    """
    net.eval()
    if loader_len_norm is None:
        loader_len_norm = val_loader.batch_size
    progress = tqdm(val_loader, desc='Validation', leave=False, total=len(val_loader) // loader_len_norm)

    seen = 0
    loss_sum = 0.
    all_pred = []
    all_labels = []
    for faces, targets in progress:
        all_labels.append(targets.flatten())
        with torch.no_grad():
            step_loss, step_pred = batch_forward(net, device, criterion, faces, targets)
        all_pred.append(step_pred.flatten())
        # Weight each batch loss by its size so the last, shorter batch counts correctly
        n_batch = len(targets)
        seen += n_batch
        loss_sum += step_loss.item() * n_batch

    # Logging
    val_loss = loss_sum / seen
    tb.add_scalar('{}/loss'.format(tag), val_loss, iteration)

    if isinstance(criterion, nn.BCEWithLogitsLoss):
        y_true = np.concatenate(all_labels)
        y_score = np.concatenate(all_pred)
        auc = roc_auc_score(y_true, y_score)
        tb.add_scalar('{}/roc_auc'.format(tag), auc, iteration)
        tb.add_pr_curve('{}/pr'.format(tag), y_true, y_score, iteration)

    return val_loss
def save_model(net: nn.Module, optimizer: optim.Optimizer,
               train_loss: float, val_loss: float,
               iteration: int, batch_size: int, epoch: int,
               path: str):
    """
    Write a training checkpoint to `path`.

    The checkpoint is a dict with the keys `net`, `opt`, `train_loss`, `val_loss`,
    `iteration`, `batch_size` and `epoch`.

    The file is first written next to the destination (`<path>.tmp`) and then moved over it
    with `os.replace`, so a run killed in the middle of a save never leaves a truncated
    checkpoint behind: `path` holds either the previous checkpoint or the new one.

    :param net: model whose `state_dict()` is stored under `net`
    :param optimizer: optimizer whose `state_dict()` is stored under `opt`
    :param train_loss: stored as is
    :param val_loss: stored as is
    :param iteration: stored as is
    :param batch_size: stored as is
    :param epoch: stored as is
    :param path: destination file; anything accepted by `str()`
    """
    path = str(path)
    state = dict(net=net.state_dict(),
                 opt=optimizer.state_dict(),
                 train_loss=train_loss,
                 val_loss=val_loss,
                 iteration=iteration,
                 batch_size=batch_size,
                 epoch=epoch)
    tmp_path = '{}.tmp'.format(path)
    try:
        torch.save(state, tmp_path)
        os.replace(tmp_path, path)
    finally:
        # Only still present when saving or moving failed
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
# Run the training only when the file is executed as a script, not when it is imported
if __name__ == '__main__':
    main()

models/icpr2020dfdc/train_triplet.py ADDED Viewed

	@@ -0,0 +1,459 @@

+"""
+Video Face Manipulation Detection Through Ensemble of CNNs
+Image and Sound Processing Lab - Politecnico di Milano
+Nicolò Bonettini
+Edoardo Daniele Cannas
+Sara Mandelli
+Luca Bondi
+Paolo Bestagini
+"""
+import argparse
+import os
+import shutil
+import warnings
+import numpy as np
+import torch
+import torch.multiprocessing
+torch.multiprocessing.set_sharing_strategy('file_system')
+import torch.nn as nn
+import torch.optim as optim
+from tensorboardX import SummaryWriter
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+from architectures import tripletnet
+from train_binclass import save_model, tb_attention
+from isplutils.data import FrameFaceIterableDataset
+from isplutils.data_siamese import FrameFaceTripletIterableDataset
+from isplutils import split, utils
def main():
    """
    Command-line entry point: train a triplet network on DFDC / FF++ face crops.

    Parses the arguments, builds net, loss, optimizer and LR scheduler, optionally restores a
    checkpoint, then trains until either the learning rate reaches its floor or more than
    `--maxiter` iterations have been run. TensorBoard logs go to `<log_dir>/<tag>` and
    checkpoints (`last.pth`, `bestval.pth`, `itNNNNNN.pth`) to `<models_dir>/<tag>`.
    """
    # Args
    parser = argparse.ArgumentParser()
    parser.add_argument('--net', type=str, help='Net model class', required=True)
    parser.add_argument('--traindb', type=str, help='Training datasets', nargs='+', choices=split.available_datasets,
                        required=True)
    parser.add_argument('--valdb', type=str, help='Validation datasets', nargs='+', choices=split.available_datasets,
                        required=True)
    parser.add_argument('--dfdc_faces_df_path', type=str, action='store',
                        help='Path to the Pandas Dataframe obtained from extract_faces.py on the DFDC dataset. '
                             'Required for training/validating on the DFDC dataset.')
    parser.add_argument('--dfdc_faces_dir', type=str, action='store',
                        help='Path to the directory containing the faces extracted from the DFDC dataset. '
                             'Required for training/validating on the DFDC dataset.')
    parser.add_argument('--ffpp_faces_df_path', type=str, action='store',
                        help='Path to the Pandas Dataframe obtained from extract_faces.py on the FF++ dataset. '
                             'Required for training/validating on the FF++ dataset.')
    parser.add_argument('--ffpp_faces_dir', type=str, action='store',
                        help='Path to the directory containing the faces extracted from the FF++ dataset. '
                             'Required for training/validating on the FF++ dataset.')
    parser.add_argument('--face', type=str, help='Face crop or scale', required=True,
                        choices=['scale', 'tight'])
    parser.add_argument('--size', type=int, help='Train patch size', required=True)
    parser.add_argument('--batch', type=int, help='Batch size to fit in GPU memory', default=12)
    parser.add_argument('--lr', type=float, default=1e-5, help='Learning rate')
    parser.add_argument('--valint', type=int, help='Validation interval (iterations)', default=500)
    parser.add_argument('--patience', type=int, help='Patience before dropping the LR [validation intervals]',
                        default=10)
    parser.add_argument('--maxiter', type=int, help='Maximum number of iterations', default=20000)
    parser.add_argument('--init', type=str, help='Weight initialization file')
    parser.add_argument('--scratch', action='store_true', help='Train from scratch')
    parser.add_argument('--traintriplets', type=int, help='Limit the number of train triplets per epoch', default=-1)
    parser.add_argument('--valtriplets', type=int, help='Limit the number of validation triplets per epoch',
                        default=2000)
    parser.add_argument('--logint', type=int, help='Training log interval (iterations)', default=100)
    parser.add_argument('--workers', type=int, help='Num workers for data loaders', default=6)
    parser.add_argument('--device', type=int, help='GPU device id', default=0)
    parser.add_argument('--seed', type=int, help='Random seed', default=0)
    parser.add_argument('--debug', action='store_true', help='Activate debug')
    parser.add_argument('--suffix', type=str, help='Suffix to default tag')
    parser.add_argument('--attention', action='store_true',
                        help='Enable Tensorboard log of attention masks')
    parser.add_argument('--embedding', action='store_true', help='Activate embedding visualization in TensorBoard')
    parser.add_argument('--embeddingint', type=int, help='Embedding visualization interval in TensorBoard',
                        default=5000)
    parser.add_argument('--log_dir', type=str, help='Directory for saving the training logs',
                        default='runs/triplet/')
    parser.add_argument('--models_dir', type=str, help='Directory for saving the models weights',
                        default='weights/triplet/')
    args = parser.parse_args()

    # Parse arguments
    net_class = getattr(tripletnet, args.net)
    train_datasets = args.traindb
    val_datasets = args.valdb
    dfdc_df_path = args.dfdc_faces_df_path
    ffpp_df_path = args.ffpp_faces_df_path
    dfdc_faces_dir = args.dfdc_faces_dir
    ffpp_faces_dir = args.ffpp_faces_dir
    face_policy = args.face
    face_size = args.size
    batch_size = args.batch
    initial_lr = args.lr
    validation_interval = args.valint
    patience = args.patience
    max_num_iterations = args.maxiter
    initial_model = args.init
    train_from_scratch = args.scratch
    max_train_triplets = args.traintriplets
    max_val_triplets = args.valtriplets
    log_interval = args.logint
    num_workers = args.workers
    device = torch.device('cuda:{:d}'.format(args.device)) if torch.cuda.is_available() else torch.device('cpu')
    seed = args.seed
    debug = args.debug
    suffix = args.suffix
    enable_attention = args.attention
    enable_embedding = args.embedding
    embedding_interval = args.embeddingint
    weights_folder = args.models_dir
    logs_folder = args.log_dir

    # Random initialization
    np.random.seed(seed)
    torch.random.manual_seed(seed)

    # Load net
    net: nn.Module = net_class().to(device)

    # Loss and optimizers
    criterion = nn.TripletMarginLoss()
    # Training stops once the scheduler has pushed the LR down to this floor
    min_lr = initial_lr * 1e-5
    optimizer = optim.Adam(net.get_trainable_parameters(), lr=initial_lr)
    lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer=optimizer,
        mode='min',
        factor=0.1,
        patience=patience,
        cooldown=2 * patience,
        min_lr=min_lr,
    )

    # Run identifier: names both the checkpoint folder and the TensorBoard folder
    tag = utils.make_train_tag(net_class=net_class,
                               traindb=train_datasets,
                               face_policy=face_policy,
                               patch_size=face_size,
                               seed=seed,
                               suffix=suffix,
                               debug=debug,
                               )

    # Model checkpoint paths
    bestval_path = os.path.join(weights_folder, tag, 'bestval.pth')
    last_path = os.path.join(weights_folder, tag, 'last.pth')
    periodic_path = os.path.join(weights_folder, tag, 'it{:06d}.pth')
    os.makedirs(os.path.join(weights_folder, tag), exist_ok=True)

    # Load model
    val_loss = min_val_loss = 20
    epoch = iteration = 0
    net_state = None
    opt_state = None
    if initial_model is not None:
        # If given load initial model (weights only: optimizer and counters start fresh)
        print('Loading model from: {}'.format(initial_model))
        state = torch.load(initial_model, map_location='cpu')
        net_state = state['net']
    elif not train_from_scratch and os.path.exists(last_path):
        # Resume the interrupted run: weights, optimizer and counters
        print('Loading model from: {}'.format(last_path))
        state = torch.load(last_path, map_location='cpu')
        net_state = state['net']
        opt_state = state['opt']
        iteration = state['iteration'] + 1
        epoch = state['epoch']
    if not train_from_scratch and os.path.exists(bestval_path):
        # Keep competing against the best validation loss reached so far
        state = torch.load(bestval_path, map_location='cpu')
        min_val_loss = state['val_loss']
    if net_state is not None:
        # Checkpoints without `feat_ext.` keys get them prefixed before loading
        adapt_binclass_model(net_state)
        incomp_keys = net.load_state_dict(net_state, strict=False)
        print(incomp_keys)
    if opt_state is not None:
        # The LR given on the command line wins over the one stored in the checkpoint
        for param_group in opt_state['param_groups']:
            param_group['lr'] = initial_lr
        optimizer.load_state_dict(opt_state)

    # Initialize Tensorboard
    logdir = os.path.join(logs_folder, tag)
    if iteration == 0:
        # If training from scratch or initialization remove history if exists
        shutil.rmtree(logdir, ignore_errors=True)

    # TensorboardX instance
    tb = SummaryWriter(logdir=logdir)
    if iteration == 0:
        dummy = torch.randn((1, 3, face_size, face_size), device=device)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            tb.add_graph(net, [dummy, dummy, dummy], verbose=False)

    transformer = utils.get_transformer(face_policy=face_policy, patch_size=face_size,
                                        net_normalizer=net.get_normalizer(), train=True)

    # Datasets and data loaders
    print('Loading data')
    # Check if paths for DFDC and FF++ extracted faces and DataFrames are provided
    for dataset in train_datasets:
        if dataset.split('-')[0] == 'dfdc' and (dfdc_df_path is None or dfdc_faces_dir is None):
            raise RuntimeError('Specify DataFrame and directory for DFDC faces for training!')
        elif dataset.split('-')[0] == 'ff' and (ffpp_df_path is None or ffpp_faces_dir is None):
            raise RuntimeError('Specify DataFrame and directory for FF++ faces for training!')
    for dataset in val_datasets:
        if dataset.split('-')[0] == 'dfdc' and (dfdc_df_path is None or dfdc_faces_dir is None):
            raise RuntimeError('Specify DataFrame and directory for DFDC faces for validation!')
        elif dataset.split('-')[0] == 'ff' and (ffpp_df_path is None or ffpp_faces_dir is None):
            raise RuntimeError('Specify DataFrame and directory for FF++ faces for validation!')
    splits = split.make_splits(dfdc_df=dfdc_df_path, ffpp_df=ffpp_df_path, dfdc_dir=dfdc_faces_dir,
                               ffpp_dir=ffpp_faces_dir, dbs={'train': train_datasets, 'val': val_datasets})
    train_dfs = [splits['train'][db][0] for db in splits['train']]
    train_roots = [splits['train'][db][1] for db in splits['train']]
    val_roots = [splits['val'][db][1] for db in splits['val']]
    val_dfs = [splits['val'][db][0] for db in splits['val']]
    train_dataset = FrameFaceTripletIterableDataset(roots=train_roots,
                                                    dfs=train_dfs,
                                                    scale=face_policy,
                                                    num_triplets=max_train_triplets,
                                                    transformer=transformer,
                                                    size=face_size,
                                                    )
    val_dataset = FrameFaceTripletIterableDataset(roots=val_roots,
                                                  dfs=val_dfs,
                                                  scale=face_policy,
                                                  num_triplets=max_val_triplets,
                                                  transformer=transformer,
                                                  size=face_size,
                                                  )
    train_loader = DataLoader(train_dataset, num_workers=num_workers, batch_size=batch_size, )
    val_loader = DataLoader(val_dataset, num_workers=num_workers, batch_size=batch_size, )
    print('Training triplets: {}'.format(len(train_dataset)))
    print('Validation triplets: {}'.format(len(val_dataset)))
    if len(train_dataset) == 0:
        print('No training triplets. Halt.')
        return
    if len(val_dataset) == 0:
        print('No validation triplets. Halt.')
        return

    # Embedding visualization
    if enable_embedding:
        train_dataset_embedding = FrameFaceIterableDataset(roots=train_roots,
                                                           dfs=train_dfs,
                                                           scale=face_policy,
                                                           num_samples=64,
                                                           transformer=transformer,
                                                           size=face_size,
                                                           )
        train_loader_embedding = DataLoader(train_dataset_embedding, num_workers=num_workers, batch_size=batch_size, )
        val_dataset_embedding = FrameFaceIterableDataset(roots=val_roots,
                                                         dfs=val_dfs,
                                                         scale=face_policy,
                                                         num_samples=64,
                                                         transformer=transformer,
                                                         size=face_size,
                                                         )
        val_loader_embedding = DataLoader(val_dataset_embedding, num_workers=num_workers, batch_size=batch_size, )
    else:
        train_loader_embedding = None
        val_loader_embedding = None

    stop = False
    while not stop:
        # Training
        optimizer.zero_grad()
        train_loss = train_num = 0
        for train_batch in tqdm(train_loader, desc='Epoch {:03d}'.format(epoch), leave=False,
                                total=len(train_loader) // train_loader.batch_size):
            # Validation / attention / embedding below leave the net in eval mode
            net.train()
            train_batch_num = len(train_batch[0])
            train_num += train_batch_num
            train_batch_loss = batch_forward(net, device, criterion, train_batch)
            if torch.isnan(train_batch_loss):
                raise ValueError('NaN loss')
            train_loss += train_batch_loss.item() * train_batch_num

            # Optimization
            train_batch_loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            # Logging
            if iteration > 0 and (iteration % log_interval == 0):
                train_loss /= train_num
                tb.add_scalar('train/loss', train_loss, iteration)
                tb.add_scalar('lr', optimizer.param_groups[0]['lr'], iteration)
                tb.add_scalar('epoch', epoch, iteration)
                # Checkpoint
                save_model(net, optimizer, train_loss, val_loss, iteration, batch_size, epoch, last_path)
                train_loss = train_num = 0

            # Validation
            if iteration > 0 and (iteration % validation_interval == 0):
                # Validation
                val_loss = validation_routine(net, device, val_loader, criterion, tb, iteration, tag='val')
                tb.flush()
                # LR Scheduler
                lr_scheduler.step(val_loss)
                # Model checkpoint
                save_model(net, optimizer, train_loss, val_loss, iteration, batch_size, epoch,
                           periodic_path.format(iteration))
                if val_loss < min_val_loss:
                    min_val_loss = val_loss
                    shutil.copy(periodic_path.format(iteration), bestval_path)
                # Attention
                if enable_attention and hasattr(net, 'feat_ext') and hasattr(net.feat_ext, 'get_attention'):
                    net.eval()
                    # For each dataframe show the attention for a real,fake couple of frames.
                    # The loop variable must not be called `tag`: that would overwrite the run
                    # tag the embedding logging below relies on.
                    for df, root, sample_idx, att_tag in [
                        (train_dfs[0], train_roots[0], train_dfs[0][train_dfs[0]['label'] == False].index[0],
                         'train/att/real'),
                        (train_dfs[0], train_roots[0], train_dfs[0][train_dfs[0]['label'] == True].index[0],
                         'train/att/fake'),
                    ]:
                        record = df.loc[sample_idx]
                        tb_attention(tb, att_tag, iteration, net.feat_ext, device, face_size, face_policy,
                                     transformer, root, record)
                if optimizer.param_groups[0]['lr'] <= min_lr:
                    print('Reached minimum learning rate. Stopping.')
                    stop = True
                    break

            # Embedding visualization
            if enable_embedding and iteration > 0 and (iteration % embedding_interval == 0):
                embedding_routine(net=net,
                                  device=device,
                                  loader=train_loader_embedding,
                                  iteration=iteration,
                                  tb=tb,
                                  tag=tag + '/train')
                embedding_routine(net=net,
                                  device=device,
                                  loader=val_loader_embedding,
                                  iteration=iteration,
                                  tb=tb,
                                  tag=tag + '/val')

            iteration += 1
            if iteration > max_num_iterations:
                print('Maximum number of iterations reached')
                stop = True
                break
            # End of iteration
        epoch += 1

    # Needed to flush out last events
    tb.close()
    print('Completed')
def adapt_binclass_model(net_state):
    """
    Prefix, in place, every key of a state dict with ``feat_ext.`` unless one already has it.

    If at least one key starts with ``feat_ext.`` the mapping is left untouched. Otherwise
    ``'Adapting keys'`` is printed and each entry ``key`` is re-inserted as
    ``feat_ext.<key>`` (an empty mapping therefore prints the message and stays empty).

    :param net_state: mutable mapping from parameter name to value, modified in place
    :return: None
    """
    if any(key.startswith('feat_ext.') for key in net_state):
        return
    # Adapt all keys
    print('Adapting keys')
    # Iterate over a snapshot: the mapping changes size while entries are renamed
    for key in list(net_state):
        net_state['feat_ext.{}'.format(key)] = net_state.pop(key)
def batch_forward(net: nn.Module, device, criterion, data: tuple) -> torch.Tensor:
    """
    Run one forward pass on a batch of triplets and return its loss.

    :param net: model called as `net(*data)`; its outputs are unpacked into `criterion`
    :param device: device every tensor of `data` is moved to
    :param criterion: loss called as `criterion(*out)`
    :param data: iterable of tensors making up the batch
    :return: the loss returned by `criterion`
    """
    # `.to(device)` follows the requested device. The previous
    # `if torch.cuda.is_available(): i.cuda(device)` raised when a CPU device was
    # requested on a machine that happens to have CUDA.
    data = [i.to(device) for i in data]
    out = net(*data)
    loss = criterion(*out)
    return loss
def validation_routine(net, device, val_loader, criterion, tb, iteration, tag):
    """
    Evaluate the net on a whole loader of triplets and log `<tag>/loss` to TensorBoard.

    :param net: model, switched to eval mode here (it is not switched back)
    :param device: device forwarded to `batch_forward`
    :param val_loader: loader yielding batches of triplets
    :param criterion: loss forwarded to `batch_forward`
    :param tb: TensorBoard writer
    :param iteration: global step of the logged value
    :param tag: prefix of the logged tag
    :return: the per-sample average of the batch losses
    """
    net.eval()
    n_steps = len(val_loader) // val_loader.batch_size
    seen = 0
    loss_sum = 0.
    for triplet in tqdm(val_loader, desc='Validation', leave=False, total=n_steps):
        with torch.no_grad():
            step_loss = batch_forward(net, device, criterion, triplet, )
        # Weight each batch loss by its size so the last, shorter batch counts correctly
        n_batch = len(triplet[0])
        seen += n_batch
        loss_sum += step_loss.item() * n_batch

    # Logging
    val_loss = loss_sum / seen
    tb.add_scalar('{}/loss'.format(tag), val_loss, iteration)
    return val_loss
def embedding_routine(net: nn.Module, device: torch.device, loader: DataLoader, tb: SummaryWriter, iteration: int,
                      tag: str):
    """
    Compute the features of every face in a loader and log them as a TensorBoard embedding.

    :param net: model exposing `features`, switched to eval mode here (it is not switched back)
    :param device: device the faces are moved to when CUDA is available
    :param loader: loader yielding `(faces, labels)` batches
    :param tb: TensorBoard writer
    :param iteration: global step of the embedding
    :param tag: TensorBoard tag of the embedding
    """
    net.eval()
    label_chunks = []
    feature_chunks = []
    for faces, targets in loader:
        if torch.cuda.is_available():
            faces = faces.to(device)
        with torch.no_grad():
            feats = net.features(faces)
        label_chunks.append(targets.numpy().flatten())
        # One row per face: collapse every feature dimension after the batch one
        feature_chunks.append(torch.flatten(feats.cpu(), start_dim=1).numpy())

    metadata = list(np.concatenate(label_chunks))
    mat = np.concatenate(feature_chunks)

    # Logging
    tb.add_embedding(mat=mat, metadata=metadata, tag=tag, global_step=iteration)
# Run the training only when the file is executed as a script, not when it is imported
if __name__ == '__main__':
    main()

models/model_loader.py CHANGED Viewed

@@ -27,6 +27,7 @@ class ModelLoader:
                     cls._instance._face_detector = None
                     cls._instance._spacy_nlp = None
                     cls._instance._sentence_transformer = None
         return cls._instance
     @classmethod
@@ -146,6 +147,23 @@ class ModelLoader:
             logger.info("MediaPipe FaceMesh loaded")
         return self._face_detector
     # ---------- Preload ----------
     def preload_phase1(self) -> None:
         """Preload only what Phase 1 needs (image model)."""

                     cls._instance._face_detector = None
                     cls._instance._spacy_nlp = None
                     cls._instance._sentence_transformer = None
+                    cls._instance._efficientnet_detector = None
         return cls._instance
     @classmethod
             logger.info("MediaPipe FaceMesh loaded")
         return self._face_detector
+    # ---------- EfficientNetAutoAttB4 (ICPR2020 / DeepShield1 merge) ----------
+    def load_efficientnet(self):
+        """Return the cached EfficientNet detector, building it on first use.
+
+        Construction is attempted lazily. If it raises for any reason the failure
+        is logged as a warning and ``None`` is returned; nothing is cached in that
+        case, so a later call tries again.
+        """
+        if self._efficientnet_detector is not None:
+            return self._efficientnet_detector
+        try:
+            # Imported lazily so a missing dependency surfaces here, not at module import.
+            from services.efficientnet_service import EfficientNetDetector
+            detector = EfficientNetDetector(
+                model_name=settings.EFFICIENTNET_MODEL,
+                train_db=settings.EFFICIENTNET_TRAIN_DB,
+                device=settings.DEVICE,
+            )
+        except Exception as e:
+            logger.warning(f"EfficientNet load failed (continuing without it): {e}")
+            return None
+        self._efficientnet_detector = detector
+        return detector
     # ---------- Preload ----------
     def preload_phase1(self) -> None:
         """Preload only what Phase 1 needs (image model)."""

requirements.txt CHANGED Viewed

@@ -11,6 +11,13 @@ alembic==1.13.3
 python-jose[cryptography]==3.3.0
 bcrypt==4.2.0
 # === Phase 1: Image Detection ===
 # Install torch separately with CPU index first (see README): pip install torch==2.4.1 --index-url https://download.pytorch.org/whl/cpu
 torch==2.4.1

 python-jose[cryptography]==3.3.0
 bcrypt==4.2.0
+# === MERGE: EfficientNetAutoAttB4 (DeepShield1 / ICPR2020) ===
+albumentations>=1.3.0,<1.5       # Required by icpr2020dfdc isplutils transforms; pin to avoid 1.5+ API break
+scipy>=1.13.0                    # expit (sigmoid) for EfficientNet logit conversion
+# NOTE: MERGE_PLAN §4 said NOT to install efficientnet-pytorch, but fornet.py imports it directly.
+efficientnet-pytorch==0.7.1      # Required by icpr2020dfdc/architectures/fornet.py
+psutil>=5.9.0                    # RAM monitoring in smoke tests
 # === Phase 1: Image Detection ===
 # Install torch separately with CPU index first (see README): pip install torch==2.4.1 --index-url https://download.pytorch.org/whl/cpu
 torch==2.4.1

schemas/common.py CHANGED Viewed

@@ -86,3 +86,4 @@ class ProcessingSummary(BaseModel):
     stages_completed: List[str]
     total_duration_ms: int
     model_used: str

     stages_completed: List[str]
     total_duration_ms: int
     model_used: str
+    models_used: List[str] = []  # all models that contributed (ensemble)

services/efficientnet_service.py ADDED Viewed

	@@ -0,0 +1,209 @@

+"""EfficientNetAutoAttB4 adapter — wraps ICPR2020 DFDC model into DeepShield service interface."""
+from __future__ import annotations
+import pickle
+import sys
+from pathlib import Path
+from typing import List, Optional
+import numpy as np
+import torch
+from loguru import logger
+from PIL import Image
+from scipy.special import expit
+from torch.utils.model_zoo import load_url
+# Resolve ICPR2020 repo root and patch sys.path so its modules are importable.
+_ICPR_ROOT = (Path(__file__).resolve().parent.parent / "models" / "icpr2020dfdc").resolve()
+_NOTEBOOK_DIR = str(_ICPR_ROOT / "notebook")
+if str(_ICPR_ROOT) not in sys.path:
+    sys.path.insert(0, str(_ICPR_ROOT))
+if _NOTEBOOK_DIR not in sys.path:
+    sys.path.insert(0, _NOTEBOOK_DIR)
+# These imports are valid only after the sys.path patch above.
+from blazeface import BlazeFace, FaceExtractor  # noqa: E402
+from architectures import fornet, weights  # noqa: E402
+from isplutils import utils as ispl_utils  # noqa: E402
+# Default calibrator path — populated by scripts/fit_calibrator.py.
+_CALIBRATOR_PATH = Path(__file__).resolve().parent.parent / "models" / "efficientnet_calibrator.pkl"
+def _load_calibrator(path: Path = _CALIBRATOR_PATH):
+    """Unpickle the score calibrator stored at ``path``.
+
+    Returns the loaded object, or ``None`` when the file does not exist or
+    cannot be loaded (the failure is logged as a warning).
+    """
+    if path.exists():
+        try:
+            # NOTE(security): pickle.load can execute arbitrary code; this file
+            # must only ever come from a trusted source.
+            with path.open("rb") as f:
+                calibrator = pickle.load(f)
+            logger.info(f"Isotonic calibrator loaded from {path}")
+            return calibrator
+        except Exception as e:
+            logger.warning(f"Failed to load calibrator ({e}) — using raw sigmoid scores")
+    return None
+class EfficientNetDetector:
+    """Thin adapter that loads EfficientNetAutoAttB4 (DFDC-trained) and exposes
+    detect_image() / detect_video_frames() matching DeepShield's service interface.
+
+    Faces are extracted with BlazeFace, transformed with the ICPR2020 test-time
+    transformer and scored by the network; the sigmoid of the logit is used as
+    the fake probability.
+
+    If backend/models/efficientnet_calibrator.pkl exists (produced by
+    scripts/fit_calibrator.py), raw sigmoid scores are passed through an isotonic
+    regression calibrator before being returned. Set calibrator=None to disable.
+    """
+
+    def __init__(
+        self,
+        model_name: str = "EfficientNetAutoAttB4",
+        train_db: str = "DFDC",
+        device: str = "cpu",
+        calibrator_path: Optional[Path] = None,
+    ) -> None:
+        """Build the classifier, the BlazeFace extractor and the optional calibrator.
+
+        Args:
+            model_name: Name of the architecture class looked up in ``fornet``.
+            train_db: Training-set suffix used to select the weight URL.
+            device: Torch device string the models are moved to.
+            calibrator_path: Calibrator pickle to load; defaults to ``_CALIBRATOR_PATH``.
+
+        Raises:
+            KeyError: If ``<model_name>_<train_db>`` has no entry in ``weights.weight_url``.
+            FileNotFoundError: If the bundled BlazeFace weights or anchors are missing.
+        """
+        self.device = torch.device(device)
+        self.model_name = model_name
+        self.train_db = train_db
+        weight_key = f"{model_name}_{train_db}"
+        if weight_key not in weights.weight_url:
+            raise KeyError(f"Unknown model/DB combination: {weight_key}")
+        # Check the local BlazeFace assets first so a broken checkout fails fast,
+        # before any classifier weights are fetched.
+        blazeface_dir = _ICPR_ROOT / "blazeface"
+        weights_path = blazeface_dir / "blazeface.pth"
+        anchors_path = blazeface_dir / "anchors.npy"
+        if not weights_path.exists() or not anchors_path.exists():
+            raise FileNotFoundError(
+                f"BlazeFace assets missing: expected {weights_path} and {anchors_path}. "
+                "Ensure icpr2020dfdc is cloned into backend/models/ with its blazeface/ subdirectory."
+            )
+        self.net = getattr(fornet, model_name)().eval().to(self.device)
+        # check_hash=False — the ISPL mirror occasionally has stale sha256 hashes in URLs.
+        state = load_url(weights.weight_url[weight_key], map_location=self.device, check_hash=False)
+        self.net.load_state_dict(state)
+        self.transf = ispl_utils.get_transformer(
+            "scale", 224, self.net.get_normalizer(), train=False
+        )
+        self.facedet = BlazeFace().to(self.device)
+        self.facedet.load_weights(str(weights_path))
+        self.facedet.load_anchors(str(anchors_path))
+        self.face_extractor = FaceExtractor(facedet=self.facedet)
+        self.calibrator = _load_calibrator(calibrator_path or _CALIBRATOR_PATH)
+        self.calibrator_applied = self.calibrator is not None
+        logger.info(
+            f"EfficientNetDetector ready: {model_name}/{train_db} on {self.device} "
+            f"| calibrator={'yes' if self.calibrator_applied else 'no'}"
+        )
+
+    @property
+    def _model_id(self) -> str:
+        """Identifier reported in result dicts: ``<model_name>_<train_db>``."""
+        return f"{self.model_name}_{self.train_db}"
+
+    def _face_tensor(self, face_np: np.ndarray) -> torch.Tensor:
+        """Apply the albumentations transform to a cropped face array and return its ``"image"`` entry."""
+        result = self.transf(image=face_np)
+        return result["image"]
+
+    def _calibrate(self, score: float) -> float:
+        """Apply isotonic calibration if available; otherwise return score unchanged.
+
+        A failing calibrator is logged and the raw score is returned, so inference
+        stays best-effort without hiding the problem.
+        """
+        if self.calibrator is None:
+            return score
+        try:
+            return float(self.calibrator.predict([[score]])[0])
+        except Exception as e:
+            logger.warning(f"Calibration failed ({e}) — returning raw score")
+            return score
+
+    def _calibrate_batch(self, scores: np.ndarray) -> np.ndarray:
+        """Apply isotonic calibration to a 1-D array of scores (raw scores on failure)."""
+        if self.calibrator is None:
+            return scores
+        try:
+            return self.calibrator.predict(scores.reshape(-1, 1)).flatten()
+        except Exception as e:
+            logger.warning(f"Batch calibration failed ({e}) — returning raw scores")
+            return scores
+
+    def raw_logit(self, face_tensor: torch.Tensor) -> float:
+        """Return raw logit for a single face tensor — used by fit_calibrator.py."""
+        with torch.inference_mode():
+            return float(self.net(face_tensor.unsqueeze(0).to(self.device)).item())
+
+    def detect_image(self, pil_image: Image.Image) -> dict:
+        """Run EfficientNet on the first face BlazeFace extracts from a PIL image.
+
+        Non-RGB images are converted to RGB first.
+
+        Returns:
+            {"score": float|None, "result": "FAKE"|"REAL"|None, "model": str,
+             "error": str|None, "calibrator_applied": bool}
+            ``error`` is ``"no_face"`` (with ``score``/``result`` set to None)
+            when no face is extracted.
+        """
+        if pil_image.mode != "RGB":
+            pil_image = pil_image.convert("RGB")
+        img_array = np.array(pil_image)
+        frame_data = self.face_extractor.process_image(img=img_array)
+        faces: list = frame_data.get("faces", [])
+        if not faces:
+            logger.debug("EfficientNetDetector.detect_image: no face detected")
+            return {
+                "error": "no_face",
+                "score": None,
+                "result": None,
+                "model": self._model_id,
+                "calibrator_applied": False,
+            }
+        face_t = self._face_tensor(faces[0])
+        with torch.inference_mode():
+            logit = self.net(face_t.unsqueeze(0).to(self.device))
+            raw_score = float(torch.sigmoid(logit).item())
+        score = self._calibrate(raw_score)
+        return {
+            "score": score,
+            "result": "FAKE" if score > 0.5 else "REAL",
+            "model": self._model_id,
+            "error": None,
+            "calibrator_applied": self.calibrator_applied,
+        }
+
+    def detect_video_frames(self, frames: List[np.ndarray]) -> dict:
+        """Run EfficientNet on a list of BGR numpy frames (as extracted by OpenCV).
+
+        uint8 frames with three channels always have their channel order
+        reversed before face extraction, so callers must pass BGR data; other
+        arrays are forwarded unchanged. Frames without an extracted face are
+        skipped and do not appear in ``per_frame``.
+
+        Returns:
+            {"mean_score": float|None, "per_frame": list[float], "model": str,
+             "error": str|None, "calibrator_applied": bool}
+            ``mean_score`` is the (calibrated) sigmoid of the mean logit.
+        """
+        face_tensors: list[torch.Tensor] = []
+        for frame in frames:
+            # Reverse the channel axis (BGR -> RGB) for uint8 colour frames.
+            if frame.ndim == 3 and frame.shape[2] == 3:
+                frame_rgb = frame[..., ::-1].copy() if frame.dtype == np.uint8 else frame
+            else:
+                frame_rgb = frame
+            frame_data = self.face_extractor.process_image(img=frame_rgb)
+            faces: list = frame_data.get("faces", [])
+            if faces:
+                face_tensors.append(self._face_tensor(faces[0]))
+        if not face_tensors:
+            logger.debug("EfficientNetDetector.detect_video_frames: no faces in any frame")
+            return {
+                "error": "no_faces",
+                "mean_score": None,
+                "per_frame": [],
+                "model": self._model_id,
+                "calibrator_applied": False,
+            }
+        batch = torch.stack(face_tensors).to(self.device)
+        with torch.inference_mode():
+            logits = self.net(batch).cpu().numpy().flatten()
+        raw_per_frame = expit(logits)
+        per_frame = self._calibrate_batch(raw_per_frame).tolist()
+        # Average in logit space, then squash and calibrate the single summary score.
+        mean_score = float(self._calibrate(float(expit(np.mean(logits)))))
+        return {
+            "mean_score": mean_score,
+            "per_frame": per_frame,
+            "model": self._model_id,
+            "error": None,
+            "calibrator_applied": self.calibrator_applied,
+        }

services/image_service.py CHANGED Viewed

@@ -1,8 +1,8 @@
 from __future__ import annotations
 import io
-from dataclasses import dataclass
-from typing import Tuple
 import torch
 from loguru import logger
@@ -17,6 +17,8 @@ class ImageClassification:
     label: str
     confidence: float
     all_scores: dict[str, float]
 def load_image_from_bytes(data: bytes) -> Image.Image:
@@ -26,8 +28,8 @@ def load_image_from_bytes(data: bytes) -> Image.Image:
     return img
-def classify_image(pil_img: Image.Image) -> ImageClassification:
-    """Run the ViT deepfake classifier on a PIL image."""
     loader = get_model_loader()
     model, processor = loader.load_image_model()
@@ -36,17 +38,88 @@ def classify_image(pil_img: Image.Image) -> ImageClassification:
     with torch.no_grad():
         outputs = model(**inputs)
-        logits = outputs.logits  # (1, num_labels)
         probs = torch.softmax(logits, dim=-1)[0]
     id2label: dict[int, str] = getattr(model.config, "id2label", {})
     all_scores = {id2label.get(i, str(i)): float(p.item()) for i, p in enumerate(probs)}
     top_idx = int(torch.argmax(probs).item())
     top_label = id2label.get(top_idx, str(top_idx))
-    top_conf = float(probs[top_idx].item())
-    logger.info(f"Image classify → {top_label} @ {top_conf:.3f}")
-    return ImageClassification(label=top_label, confidence=top_conf, all_scores=all_scores)
 def preprocess_and_classify(raw_bytes: bytes) -> Tuple[Image.Image, ImageClassification]:

 from __future__ import annotations
 import io
+from dataclasses import dataclass, field
+from typing import List, Optional, Tuple
 import torch
 from loguru import logger
     label: str
     confidence: float
     all_scores: dict[str, float]
+    models_used: List[str] = field(default_factory=list)
+    ensemble_method: Optional[str] = None
 def load_image_from_bytes(data: bytes) -> Image.Image:
     return img
+def _classify_vit(pil_img: Image.Image) -> Tuple[float, str, dict[str, float]]:
+    """Run the ViT deepfake classifier. Returns (fake_prob, top_label, all_scores)."""
     loader = get_model_loader()
     model, processor = loader.load_image_model()
     with torch.no_grad():
         outputs = model(**inputs)
+        logits = outputs.logits
         probs = torch.softmax(logits, dim=-1)[0]
     id2label: dict[int, str] = getattr(model.config, "id2label", {})
     all_scores = {id2label.get(i, str(i)): float(p.item()) for i, p in enumerate(probs)}
     top_idx = int(torch.argmax(probs).item())
     top_label = id2label.get(top_idx, str(top_idx))
+    # Identify the fake probability — pick the highest score from fake-labelled classes.
+    fake_tokens = ("fake", "deepfake", "manipulated", "ai", "generated", "synthetic")
+    fake_prob = max(
+        (float(p) for lbl, p in all_scores.items() if any(t in lbl.lower() for t in fake_tokens)),
+        default=float(probs[top_idx].item()),
+    )
+    return fake_prob, top_label, all_scores
+def classify_image(pil_img: Image.Image) -> ImageClassification:
+    """Classify an image as "Fake" or "Real".
+
+    The ViT classifier always runs. When ``settings.ENSEMBLE_MODE`` is true and
+    the EfficientNet detector loads and returns a score, the two fake
+    probabilities are averaged; otherwise the ViT probability alone decides.
+    ``confidence`` always carries the fake probability used for the verdict.
+    """
+    vit_fake_prob, vit_label, vit_scores = _classify_vit(pil_img)
+    models_used = [settings.IMAGE_MODEL_ID]
+
+    def _vit_only(method: Optional[str]) -> ImageClassification:
+        # Verdict derived from the ViT fake probability alone.
+        return ImageClassification(
+            label="Fake" if vit_fake_prob >= 0.5 else "Real",
+            confidence=vit_fake_prob,
+            all_scores=vit_scores,
+            models_used=models_used,
+            ensemble_method=method,
+        )
+
+    if not settings.ENSEMBLE_MODE:
+        logger.info(f"Image classify (ViT-only) → {vit_label} @ fake_p={vit_fake_prob:.3f}")
+        return _vit_only(None)
+
+    # Attempt EfficientNet inference.
+    eff_detector = get_model_loader().load_efficientnet()
+    if eff_detector is None:
+        logger.warning("EfficientNet unavailable — falling back to ViT-only")
+        return _vit_only(None)
+
+    eff_result = eff_detector.detect_image(pil_img)
+    if eff_result.get("error") or eff_result.get("score") is None:
+        # The detector reported an error or gave no score — trust ViT alone.
+        logger.info(f"EfficientNet no-face fallback → using ViT score {vit_fake_prob:.3f}")
+        return _vit_only("vit_only_no_face")
+
+    eff_fake_prob: float = eff_result["score"]
+    models_used.append(eff_result["model"])
+    # Simple average ensemble.
+    ensemble_prob = (vit_fake_prob + eff_fake_prob) / 2.0
+    label = "Fake" if ensemble_prob >= 0.5 else "Real"
+    logger.info(
+        f"Image classify (ensemble) → {label} | vit={vit_fake_prob:.3f} eff={eff_fake_prob:.3f} avg={ensemble_prob:.3f}"
+    )
+    combined_scores = {f"vit_{name}": prob for name, prob in vit_scores.items()}
+    combined_scores["efficientnet_fake"] = eff_fake_prob
+    combined_scores["efficientnet_real"] = 1.0 - eff_fake_prob
+    return ImageClassification(
+        label=label,
+        confidence=ensemble_prob,
+        all_scores=combined_scores,
+        models_used=models_used,
+        ensemble_method="average",
+    )
 def preprocess_and_classify(raw_bytes: bytes) -> Tuple[Image.Image, ImageClassification]:

services/metadata_writer.py ADDED Viewed

	@@ -0,0 +1,73 @@

+"""Optional ExifTool metadata writer — embeds DeepShield verdict into analyzed file metadata.
+Gated behind EXIFTOOL_PATH env var. Silently skips if ExifTool is not configured.
+Install ExifTool: https://exiftool.org/ — set EXIFTOOL_PATH in .env to enable.
+"""
+from __future__ import annotations
+import subprocess
+from pathlib import Path
+from typing import Optional
+from loguru import logger
+from config import settings
+def _exiftool_path() -> Optional[str]:
+    """Return ``settings.EXIFTOOL_PATH`` when it is set and names an existing file, else ``None``."""
+    configured = getattr(settings, "EXIFTOOL_PATH", "")
+    if not configured or not Path(configured).is_file():
+        return None
+    return configured
+def write_verdict_metadata(
+    file_path: str,
+    verdict: str,
+    authenticity_score: int,
+    models_used: list[str],
+    analysis_id: str,
+) -> bool:
+    """Write the DeepShield verdict into a file's Comment/UserComment tags via ExifTool.
+
+    The file is modified in place (``-overwrite_original``) with a 15 second
+    timeout. Every failure is logged as a warning rather than raised.
+
+    Returns:
+        True if ExifTool exited with code 0; False if ExifTool is not
+        configured, exits non-zero, times out, or cannot be run.
+    """
+    exiftool = _exiftool_path()
+    if not exiftool:
+        return False
+    comment = " | ".join(
+        [
+            f"DeepShield verdict: {verdict}",
+            f"score: {authenticity_score}",
+            f"models: {','.join(models_used)}",
+            f"id: {analysis_id}",
+        ]
+    )
+    # Argument list (no shell), so the comment text is passed verbatim.
+    command = [
+        exiftool,
+        f"-Comment={comment}",
+        f"-UserComment={comment}",
+        "-overwrite_original",
+        file_path,
+    ]
+    try:
+        result = subprocess.run(command, capture_output=True, text=True, timeout=15)
+        if result.returncode != 0:
+            logger.warning(f"ExifTool failed (rc={result.returncode}): {result.stderr.strip()}")
+            return False
+        logger.info(f"ExifTool wrote verdict metadata to {file_path}")
+        return True
+    except FileNotFoundError:
+        logger.warning(f"ExifTool not found at {exiftool}")
+        return False
+    except subprocess.TimeoutExpired:
+        logger.warning("ExifTool timed out writing metadata")
+        return False
+    except Exception as e:
+        logger.warning(f"ExifTool metadata write failed: {e}")
+        return False

services/video_service.py CHANGED Viewed

@@ -1,15 +1,16 @@
 from __future__ import annotations
 from dataclasses import dataclass, field
-from typing import List, Tuple
 import cv2
 import numpy as np
 from loguru import logger
 from PIL import Image
 from models.model_loader import get_model_loader
-from services.image_service import classify_image
 @dataclass
@@ -18,10 +19,10 @@ class FrameAnalysis:
     timestamp_s: float
     label: str
     confidence: float
-    suspicious_prob: float  # prob of the fake/manipulated class
     is_suspicious: bool
     has_face: bool = False
-    scored: bool = False  # contributed to aggregate (face frames only)
 @dataclass
@@ -35,6 +36,8 @@ class VideoAggregation:
     insufficient_faces: bool
     suspicious_timestamps: List[float] = field(default_factory=list)
     frames: List[FrameAnalysis] = field(default_factory=list)
 FAKE_TOKENS = ("fake", "deepfake", "manipulated", "ai", "generated", "synthetic")
@@ -45,9 +48,9 @@ def _is_fake_label(label: str) -> bool:
     return any(tok in l for tok in FAKE_TOKENS)
-def extract_frames(video_path: str, num_frames: int = 16) -> List[Tuple[int, float, Image.Image]]:
-    """Uniformly sample num_frames frames from the video. Returns list of
-    (frame_index, timestamp_seconds, PIL.Image).
     """
     cap = cv2.VideoCapture(video_path)
     if not cap.isOpened():
@@ -62,7 +65,7 @@ def extract_frames(video_path: str, num_frames: int = 16) -> List[Tuple[int, flo
     n = min(num_frames, total)
     indices = np.linspace(0, max(0, total - 1), num=n, dtype=int).tolist()
-    out: List[Tuple[int, float, Image.Image]] = []
     for idx in indices:
         cap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
         ok, frame_bgr = cap.read()
@@ -71,40 +74,97 @@ def extract_frames(video_path: str, num_frames: int = 16) -> List[Tuple[int, flo
         frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
         pil = Image.fromarray(frame_rgb)
         ts = (idx / fps) if fps > 0 else 0.0
-        out.append((int(idx), float(ts), pil))
     cap.release()
     logger.info(f"Extracted {len(out)}/{n} frames from video (total={total}, fps={fps:.2f})")
     return out
-MIN_FACE_FRAMES = 3  # below this we refuse to issue a deepfake verdict
-def _has_face(pil: Image.Image) -> bool:
     detector = get_model_loader().load_face_detector()
     arr = np.array(pil)
     res = detector.process(arr)
     return bool(getattr(res, "multi_face_landmarks", None))
-def classify_frames(frames: List[Tuple[int, float, Image.Image]]) -> List[FrameAnalysis]:
     results: List[FrameAnalysis] = []
-    for idx, ts, pil in frames:
-        face = _has_face(pil)
-        clf = classify_image(pil)
         fake_prob = 0.0
-        for lbl, p in clf.all_scores.items():
-            if _is_fake_label(lbl):
-                fake_prob = max(fake_prob, float(p))
         results.append(
             FrameAnalysis(
                 index=idx,
                 timestamp_s=ts,
-                label=clf.label,
-                confidence=clf.confidence,
                 suspicious_prob=fake_prob,
-                is_suspicious=(fake_prob >= 0.5) and face,
                 has_face=face,
                 scored=face,
             )
@@ -112,18 +172,20 @@ def classify_frames(frames: List[Tuple[int, float, Image.Image]]) -> List[FrameA
     return results
-def aggregate(frames: List[FrameAnalysis]) -> VideoAggregation:
-    if not frames:
         return VideoAggregation(0, 0, 0, 0.0, 0.0, 0.0, True)
-    scored = [f for f in frames if f.scored]
     num_face = len(scored)
     insufficient = num_face < MIN_FACE_FRAMES
     if insufficient:
-        mean_p = 0.0
-        max_p = 0.0
-        susp_ratio = 0.0
         susp: List[FrameAnalysis] = []
     else:
         probs = [f.suspicious_prob for f in scored]
@@ -133,19 +195,28 @@ def aggregate(frames: List[FrameAnalysis]) -> VideoAggregation:
         susp_ratio = len(susp) / len(scored)
     return VideoAggregation(
-        num_frames_sampled=len(frames),
         num_face_frames=num_face,
-        num_suspicious_frames=len(susp),
         mean_suspicious_prob=mean_p,
         max_suspicious_prob=max_p,
         suspicious_ratio=susp_ratio,
         insufficient_faces=insufficient,
-        suspicious_timestamps=[round(f.timestamp_s, 2) for f in susp],
-        frames=frames,
     )
 def analyze_video(video_path: str, num_frames: int = 16) -> VideoAggregation:
     frames = extract_frames(video_path, num_frames=num_frames)
-    classified = classify_frames(frames)
-    return aggregate(classified)

 from __future__ import annotations
 from dataclasses import dataclass, field
+from typing import List, Optional, Tuple
 import cv2
 import numpy as np
 from loguru import logger
 from PIL import Image
+from config import settings
 from models.model_loader import get_model_loader
+from services.image_service import _classify_vit
 @dataclass
     timestamp_s: float
     label: str
     confidence: float
+    suspicious_prob: float
     is_suspicious: bool
     has_face: bool = False
+    scored: bool = False
 @dataclass
     insufficient_faces: bool
     suspicious_timestamps: List[float] = field(default_factory=list)
     frames: List[FrameAnalysis] = field(default_factory=list)
+    models_used: List[str] = field(default_factory=list)
+    face_detector_used: str = "mediapipe"
 FAKE_TOKENS = ("fake", "deepfake", "manipulated", "ai", "generated", "synthetic")
     return any(tok in l for tok in FAKE_TOKENS)
+def extract_frames(video_path: str, num_frames: int = 16) -> List[Tuple[int, float, np.ndarray, Image.Image]]:
+    """Uniformly sample num_frames frames from the video.
+    Returns list of (frame_index, timestamp_seconds, bgr_numpy, PIL.Image).
     """
     cap = cv2.VideoCapture(video_path)
     if not cap.isOpened():
     n = min(num_frames, total)
     indices = np.linspace(0, max(0, total - 1), num=n, dtype=int).tolist()
+    out: List[Tuple[int, float, np.ndarray, Image.Image]] = []
     for idx in indices:
         cap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
         ok, frame_bgr = cap.read()
         frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
         pil = Image.fromarray(frame_rgb)
         ts = (idx / fps) if fps > 0 else 0.0
+        out.append((int(idx), float(ts), frame_bgr, pil))
     cap.release()
     logger.info(f"Extracted {len(out)}/{n} frames from video (total={total}, fps={fps:.2f})")
     return out
+MIN_FACE_FRAMES = 3
+def _has_face_mediapipe(pil: Image.Image) -> bool:
+    """Return True when the shared face detector reports ``multi_face_landmarks`` for the image."""
+    face_mesh = get_model_loader().load_face_detector()
+    outcome = face_mesh.process(np.array(pil))
+    landmarks = getattr(outcome, "multi_face_landmarks", None)
+    return bool(landmarks)
+def _analyze_with_efficientnet(
+    frames: List[Tuple[int, float, np.ndarray, Image.Image]],
+) -> Tuple[List[FrameAnalysis], str, List[str]]:
+    """Primary path: score each frame with EfficientNet on its first BlazeFace crop.
+
+    Falls back to the ViT pipeline when the EfficientNet detector cannot be
+    loaded. When BlazeFace extracts no face, MediaPipe is consulted so
+    ``has_face`` still reflects a visible face, but such frames carry no model
+    score and are therefore not marked ``scored``.
+
+    Returns:
+        (frame_results, face_detector_used, models_used)
+    """
+    # Function-scope imports, hoisted out of the per-frame loop.
+    import torch
+    from scipy.special import expit
+
+    loader = get_model_loader()
+    eff = loader.load_efficientnet()
+    if eff is None:
+        logger.warning("EfficientNet unavailable — falling back to ViT video pipeline")
+        return _analyze_with_vit(frames), "mediapipe", [settings.IMAGE_MODEL_ID]
     results: List[FrameAnalysis] = []
+    face_detector_used = "blazeface"
+    models_used = [f"{settings.EFFICIENTNET_MODEL}_{settings.EFFICIENTNET_TRAIN_DB}"]
+    for idx, ts, frame_bgr, pil in frames:
+        # Pass RGB to EfficientNet (process_image expects RGB array).
+        frame_rgb = frame_bgr[..., ::-1].copy()
+        frame_data = eff.face_extractor.process_image(img=frame_rgb)
+        faces: list = frame_data.get("faces", [])
+        blazeface_hit = bool(faces)
+        has_face = blazeface_hit
+        if not blazeface_hit:
+            # Fallback: check MediaPipe so we don't silently miss faces.
+            has_face = _has_face_mediapipe(pil)
+            if has_face:
+                face_detector_used = "blazeface+mediapipe_fallback"
         fake_prob = 0.0
+        label = "unknown"
+        if blazeface_hit:
+            # Run EfficientNet on the first face returned by BlazeFace.
+            face_t = eff._face_tensor(faces[0])
+            with torch.inference_mode():
+                logit = eff.net(face_t.unsqueeze(0).to(eff.device))
+            fake_prob = float(expit(logit.cpu().numpy().item()))
+            # Same >= 0.5 threshold as is_suspicious so label and flag agree.
+            label = "Fake" if fake_prob >= 0.5 else "Real"
+        elif not has_face:
+            label = "no_face"
         results.append(
             FrameAnalysis(
                 index=idx,
                 timestamp_s=ts,
+                label=label,
+                confidence=fake_prob,
                 suspicious_prob=fake_prob,
+                is_suspicious=(fake_prob >= 0.5) and has_face,
+                has_face=has_face,
+                # Only frames the model actually scored may feed the aggregate;
+                # a MediaPipe-only face has the placeholder probability 0.0 and
+                # would otherwise bias the video toward "real".
+                scored=blazeface_hit,
+            )
+        )
+    return results, face_detector_used, models_used
+def _analyze_with_vit(
+    frames: List[Tuple[int, float, np.ndarray, Image.Image]],
+) -> List[FrameAnalysis]:
+    """Fallback: original ViT-per-frame pipeline (MediaPipe face gate)."""
+    results: List[FrameAnalysis] = []
+    for idx, ts, _bgr, pil in frames:
+        face = _has_face_mediapipe(pil)
+        vit_fake_prob, vit_label, _ = _classify_vit(pil)
+        results.append(
+            FrameAnalysis(
+                index=idx,
+                timestamp_s=ts,
+                label=vit_label,
+                confidence=vit_fake_prob,
+                suspicious_prob=vit_fake_prob,
+                is_suspicious=(vit_fake_prob >= 0.5) and face,
                 has_face=face,
                 scored=face,
             )
     return results
+def aggregate(
+    frame_results: List[FrameAnalysis],
+    models_used: Optional[List[str]] = None,
+    face_detector_used: str = "mediapipe",
+) -> VideoAggregation:
+    if not frame_results:
         return VideoAggregation(0, 0, 0, 0.0, 0.0, 0.0, True)
+    scored = [f for f in frame_results if f.scored]
     num_face = len(scored)
     insufficient = num_face < MIN_FACE_FRAMES
     if insufficient:
+        mean_p, max_p, susp_ratio = 0.0, 0.0, 0.0
         susp: List[FrameAnalysis] = []
     else:
         probs = [f.suspicious_prob for f in scored]
         susp_ratio = len(susp) / len(scored)
     return VideoAggregation(
+        num_frames_sampled=len(frame_results),
         num_face_frames=num_face,
+        num_suspicious_frames=len(susp) if not insufficient else 0,
         mean_suspicious_prob=mean_p,
         max_suspicious_prob=max_p,
         suspicious_ratio=susp_ratio,
         insufficient_faces=insufficient,
+        suspicious_timestamps=[round(f.timestamp_s, 2) for f in (susp if not insufficient else [])],
+        frames=frame_results,
+        models_used=models_used or [settings.IMAGE_MODEL_ID],
+        face_detector_used=face_detector_used,
     )
 def analyze_video(video_path: str, num_frames: int = 16) -> VideoAggregation:
+    """Sample frames from a video, classify them, and aggregate the per-frame results.
+
+    With ``settings.ENSEMBLE_MODE`` enabled the EfficientNet path is used;
+    otherwise frames go through the ViT pipeline with the MediaPipe face gate.
+    """
     frames = extract_frames(video_path, num_frames=num_frames)
+    if not settings.ENSEMBLE_MODE:
+        return aggregate(
+            _analyze_with_vit(frames),
+            models_used=[settings.IMAGE_MODEL_ID],
+            face_detector_used="mediapipe",
+        )
+    frame_results, face_detector_used, models_used = _analyze_with_efficientnet(frames)
+    return aggregate(frame_results, models_used=models_used, face_detector_used=face_detector_used)

v1/__pycache__/__init__.cpython-311.pyc DELETED Viewed

Binary file (165 Bytes)

v1/__pycache__/analyze.cpython-311.pyc DELETED Viewed

Binary file (21.6 kB)

v1/__pycache__/auth.cpython-311.pyc DELETED Viewed

Binary file (3.82 kB)

v1/__pycache__/health.cpython-311.pyc DELETED Viewed

Binary file (556 Bytes)

v1/__pycache__/history.cpython-311.pyc DELETED Viewed

Binary file (5.19 kB)

v1/__pycache__/report.cpython-311.pyc DELETED Viewed

Binary file (4.29 kB)

v1/analyze.py CHANGED Viewed

@@ -55,6 +55,7 @@ from services.text_service import (
     score_sensationalism,
 )
 from services.video_service import analyze_video
 from utils.file_handler import read_upload_bytes, save_upload_to_tempfile
 from utils.scoring import compute_authenticity_score, get_verdict_label
@@ -89,7 +90,10 @@ async def analyze_image(
     heatmap_status = "success"
     heatmap = ""
     try:
-        heatmap = generate_heatmap_base64(pil)
         stages.append("heatmap_generation")
     except Exception as e:  # noqa: BLE001
         logger.warning(f"Heatmap generation failed, continuing: {e}")
@@ -155,6 +159,7 @@ async def analyze_image(
             stages_completed=stages,
             total_duration_ms=duration_ms,
             model_used=settings.IMAGE_MODEL_ID,
         ),
     )
@@ -218,11 +223,12 @@ async def analyze_video_endpoint(
         stages.append("frame_extraction")
         stages.append("frame_classification")
         stages.append("aggregation")
-    finally:
         try:
             os.unlink(path)
         except OSError:
             pass
     if agg.insufficient_faces:
         score = 50
@@ -271,6 +277,7 @@ async def analyze_video_endpoint(
             stages_completed=stages,
             total_duration_ms=duration_ms,
             model_used=settings.IMAGE_MODEL_ID,
         ),
     )
@@ -290,6 +297,23 @@ async def analyze_video_endpoint(
         f"frames={agg.num_frames_sampled} susp={agg.num_suspicious_frames}"
     )
     # Phase 12: LLM explainability card
     try:
         response.llm_summary = generate_llm_summary(

     score_sensationalism,
 )
 from services.video_service import analyze_video
+from services.metadata_writer import write_verdict_metadata
 from utils.file_handler import read_upload_bytes, save_upload_to_tempfile
 from utils.scoring import compute_authenticity_score, get_verdict_label
     heatmap_status = "success"
     heatmap = ""
     try:
+        model_family = "efficientnet" if settings.ENSEMBLE_MODE else "vit"
+        heatmap, heatmap_source = generate_heatmap_base64(pil, model_family=model_family)
+        if not heatmap:
+            heatmap_status = heatmap_source  # "none" or "fallback"
         stages.append("heatmap_generation")
     except Exception as e:  # noqa: BLE001
         logger.warning(f"Heatmap generation failed, continuing: {e}")
             stages_completed=stages,
             total_duration_ms=duration_ms,
             model_used=settings.IMAGE_MODEL_ID,
+            models_used=clf.models_used,
         ),
     )
         stages.append("frame_extraction")
         stages.append("frame_classification")
         stages.append("aggregation")
+    except Exception:
         try:
             os.unlink(path)
         except OSError:
             pass
+        raise
     if agg.insufficient_faces:
         score = 50
             stages_completed=stages,
             total_duration_ms=duration_ms,
             model_used=settings.IMAGE_MODEL_ID,
+            models_used=agg.models_used,
         ),
     )
         f"frames={agg.num_frames_sampled} susp={agg.num_suspicious_frames}"
     )
+    # Write verdict into video metadata (ExifTool, optional — gated by EXIFTOOL_PATH).
+    try:
+        write_verdict_metadata(
+            file_path=path,
+            verdict=label,
+            authenticity_score=score,
+            models_used=agg.models_used,
+            analysis_id=str(record.id),
+        )
+    except Exception as e:  # noqa: BLE001
+        logger.warning(f"Metadata write failed: {e}")
+    finally:
+        try:
+            os.unlink(path)
+        except OSError:
+            pass
     # Phase 12: LLM explainability card
     try:
         response.llm_summary = generate_llm_summary(