Spaces:

ar07xd
/

deepshield

Running

App Files Community

ar07xd committed 25 days ago

Commit

c3c3ac6

verified ·

1 Parent(s): 605e747

Sync from GitHub via hub-sync

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +25 -0
models/icpr2020dfdc/.gitignore +5 -0
models/icpr2020dfdc/.travis.yml +15 -0
models/icpr2020dfdc/LICENSE +674 -0
models/icpr2020dfdc/README.md +120 -0
models/icpr2020dfdc/architectures/__init__.py +0 -0
models/icpr2020dfdc/architectures/externals/__init__.py +1 -0
models/icpr2020dfdc/architectures/externals/xception.py +236 -0
models/icpr2020dfdc/architectures/fornet.py +245 -0
models/icpr2020dfdc/architectures/tripletnet.py +44 -0
models/icpr2020dfdc/architectures/weights.py +24 -0
models/icpr2020dfdc/assets/cnfidfeyln_face.gif +3 -0
models/icpr2020dfdc/assets/cnfidfeyln_face_att.gif +3 -0
models/icpr2020dfdc/assets/faces_attention.png +3 -0
models/icpr2020dfdc/assets/mqzvfufzoq_face.gif +3 -0
models/icpr2020dfdc/assets/mqzvfufzoq_face_att.gif +3 -0
models/icpr2020dfdc/blazeface/__init__.py +3 -0
models/icpr2020dfdc/blazeface/anchors.npy +3 -0
models/icpr2020dfdc/blazeface/blazeface.pth +3 -0
models/icpr2020dfdc/blazeface/blazeface.py +417 -0
models/icpr2020dfdc/blazeface/face_extract.py +470 -0
models/icpr2020dfdc/blazeface/read_video.py +213 -0
models/icpr2020dfdc/environment.yml +25 -0
models/icpr2020dfdc/extract_faces.py +346 -0
models/icpr2020dfdc/index_celebdf.py +85 -0
models/icpr2020dfdc/index_dfdc.py +94 -0
models/icpr2020dfdc/index_ffpp.py +92 -0
models/icpr2020dfdc/isplutils/__init__.py +0 -0
models/icpr2020dfdc/isplutils/data.py +263 -0
models/icpr2020dfdc/isplutils/data_siamese.py +78 -0
models/icpr2020dfdc/isplutils/split.py +135 -0
models/icpr2020dfdc/isplutils/utils.py +247 -0
models/icpr2020dfdc/notebook/Analyze results net fusion paper.ipynb +0 -0
models/icpr2020dfdc/notebook/Analyze results.ipynb +193 -0
models/icpr2020dfdc/notebook/Image prediction and attention.ipynb +0 -0
models/icpr2020dfdc/notebook/Image prediction.ipynb +0 -0
models/icpr2020dfdc/notebook/Video prediction.ipynb +0 -0
models/icpr2020dfdc/notebook/Visualise attention and features.ipynb +0 -0
models/icpr2020dfdc/notebook/Xception train val curves.ipynb +0 -0
models/icpr2020dfdc/notebook/fpv/run-binclass_net-Xception_traindb-ff-c23-720-140-140-10fpv_face-scale_size-224_seed-41-tag-train_loss.json +1 -0
models/icpr2020dfdc/notebook/fpv/run-binclass_net-Xception_traindb-ff-c23-720-140-140-10fpv_face-scale_size-224_seed-41-tag-val_loss.json +1 -0
models/icpr2020dfdc/notebook/fpv/run-binclass_net-Xception_traindb-ff-c23-720-140-140-15fpv_face-scale_size-224_seed-41-tag-train_loss.json +1 -0
models/icpr2020dfdc/notebook/fpv/run-binclass_net-Xception_traindb-ff-c23-720-140-140-15fpv_face-scale_size-224_seed-41-tag-val_loss.json +1 -0
models/icpr2020dfdc/notebook/fpv/run-binclass_net-Xception_traindb-ff-c23-720-140-140-20fpv_face-scale_size-224_seed-41-tag-train_loss.json +1 -0
models/icpr2020dfdc/notebook/fpv/run-binclass_net-Xception_traindb-ff-c23-720-140-140-20fpv_face-scale_size-224_seed-41-tag-val_loss.json +1 -0
models/icpr2020dfdc/notebook/fpv/run-binclass_net-Xception_traindb-ff-c23-720-140-140-25fpv_face-scale_size-224_seed-41-tag-train_loss.json +1 -0
models/icpr2020dfdc/notebook/fpv/run-binclass_net-Xception_traindb-ff-c23-720-140-140-25fpv_face-scale_size-224_seed-41-tag-val_loss.json +1 -0
models/icpr2020dfdc/notebook/fpv/run-binclass_net-Xception_traindb-ff-c23-720-140-140-5fpv_face-scale_size-224_seed-41-tag-train_loss.json +1 -0
models/icpr2020dfdc/notebook/fpv/run-binclass_net-Xception_traindb-ff-c23-720-140-140-5fpv_face-scale_size-224_seed-41-tag-val_loss.json +1 -0
models/icpr2020dfdc/notebook/fpv/run-binclass_net-Xception_traindb-ff-c23-720-140-140_face-scale_size-224_seed-41-tag-train_loss.json +1 -0

.gitattributes CHANGED Viewed

@@ -37,3 +37,28 @@ media/2f/2f7d41a5b57702a9a238409e6a1b973b4398f94c51fdf447e11782ed07693f06.jpg fi
 media/63/635f21138244fc1dcbff5d0525b3c0a8187b1b9cc0ad90b5bb297a76e7b3850c.jpg filter=lfs diff=lfs merge=lfs -text
 media/7b/7b626d0ddff59ca602e2e1eb02e62e21093aa647ab53c200ca5203f7fc17f6dd.jpg filter=lfs diff=lfs merge=lfs -text
 media/c0/c064c839c9469d7b616db135f08e09235abd3d73f0889d978d1f92243226a028.jpg filter=lfs diff=lfs merge=lfs -text

 media/63/635f21138244fc1dcbff5d0525b3c0a8187b1b9cc0ad90b5bb297a76e7b3850c.jpg filter=lfs diff=lfs merge=lfs -text
 media/7b/7b626d0ddff59ca602e2e1eb02e62e21093aa647ab53c200ca5203f7fc17f6dd.jpg filter=lfs diff=lfs merge=lfs -text
 media/c0/c064c839c9469d7b616db135f08e09235abd3d73f0889d978d1f92243226a028.jpg filter=lfs diff=lfs merge=lfs -text
+models/icpr2020dfdc/assets/cnfidfeyln_face.gif filter=lfs diff=lfs merge=lfs -text
+models/icpr2020dfdc/assets/cnfidfeyln_face_att.gif filter=lfs diff=lfs merge=lfs -text
+models/icpr2020dfdc/assets/faces_attention.png filter=lfs diff=lfs merge=lfs -text
+models/icpr2020dfdc/assets/mqzvfufzoq_face.gif filter=lfs diff=lfs merge=lfs -text
+models/icpr2020dfdc/assets/mqzvfufzoq_face_att.gif filter=lfs diff=lfs merge=lfs -text
+models/icpr2020dfdc/notebook/samples/lynaeydofd.mp4 filter=lfs diff=lfs merge=lfs -text
+models/icpr2020dfdc/notebook/samples/lynaeydofd_fr0.jpg filter=lfs diff=lfs merge=lfs -text
+models/icpr2020dfdc/notebook/samples/mqzvfufzoq.mp4 filter=lfs diff=lfs merge=lfs -text
+models/icpr2020dfdc/notebook/samples/mqzvfufzoq_fr0.jpg filter=lfs diff=lfs merge=lfs -text
+models/icpr2020dfdc/test/data/dfdc/dfdc_train_part_0/awnfpubqmo.mp4 filter=lfs diff=lfs merge=lfs -text
+models/icpr2020dfdc/test/data/dfdc/dfdc_train_part_0/brtujopkby.mp4 filter=lfs diff=lfs merge=lfs -text
+models/icpr2020dfdc/test/data/dfdc/dfdc_train_part_1/vtfpbtmgfh.mp4 filter=lfs diff=lfs merge=lfs -text
+models/icpr2020dfdc/test/data/dfdc/dfdc_train_part_1/zvqinhzeah.mp4 filter=lfs diff=lfs merge=lfs -text
+models/icpr2020dfdc/test/data/dfdc/dfdc_train_part_10/widuwuoiur.mp4 filter=lfs diff=lfs merge=lfs -text
+models/icpr2020dfdc/test/data/dfdc/dfdc_train_part_10/yhffcuhhjy.mp4 filter=lfs diff=lfs merge=lfs -text
+models/icpr2020dfdc/test/data/ffpp/manipulated_sequences/DeepFakeDetection/c23/videos/24_23__outside_talking_still_laughing__YR5OVD4S.mp4 filter=lfs diff=lfs merge=lfs -text
+models/icpr2020dfdc/test/data/ffpp/manipulated_sequences/Deepfakes/c23/videos/519_515.mp4 filter=lfs diff=lfs merge=lfs -text
+models/icpr2020dfdc/test/data/ffpp/manipulated_sequences/Face2Face/c23/videos/750_743.mp4 filter=lfs diff=lfs merge=lfs -text
+models/icpr2020dfdc/test/data/ffpp/manipulated_sequences/FaceSwap/c23/videos/634_660.mp4 filter=lfs diff=lfs merge=lfs -text
+models/icpr2020dfdc/test/data/ffpp/manipulated_sequences/NeuralTextures/c23/videos/004_982.mp4 filter=lfs diff=lfs merge=lfs -text
+models/icpr2020dfdc/test/data/ffpp/original_sequences/actors/c23/videos/24__outside_talking_still_laughing.mp4 filter=lfs diff=lfs merge=lfs -text
+models/icpr2020dfdc/test/data/ffpp/original_sequences/youtube/c23/videos/004.mp4 filter=lfs diff=lfs merge=lfs -text
+models/icpr2020dfdc/test/data/ffpp/original_sequences/youtube/c23/videos/519.mp4 filter=lfs diff=lfs merge=lfs -text
+models/icpr2020dfdc/test/data/ffpp/original_sequences/youtube/c23/videos/634.mp4 filter=lfs diff=lfs merge=lfs -text
+models/icpr2020dfdc/test/data/ffpp/original_sequences/youtube/c23/videos/750.mp4 filter=lfs diff=lfs merge=lfs -text

models/icpr2020dfdc/.gitignore ADDED Viewed

	@@ -0,0 +1,5 @@

+.idea/
+.DS_Store
+.ipynb_checkpoints/
+__pycache__/

models/icpr2020dfdc/.travis.yml ADDED Viewed

	@@ -0,0 +1,15 @@

+language: python
+python:
+  - "3.6.9"
+install:
+  - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O $HOME/miniconda.sh
+  - bash $HOME/miniconda.sh -bfp $HOME/miniconda3
+  - export PATH=$HOME/miniconda3/bin:$PATH
+  - conda env create -f environment.yml
+before_script:
+  - source activate icpr2020
+  - cd test
+script:
+  - python -m unittest test_dfdc.TestDFDC
+  - python -m unittest test_ffpp.TestFFPP

models/icpr2020dfdc/LICENSE ADDED Viewed

	@@ -0,0 +1,674 @@

+                    GNU GENERAL PUBLIC LICENSE
+                       Version 3, 29 June 2007
+ Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+                            Preamble
+  The GNU General Public License is a free, copyleft license for
+software and other kinds of works.
+  The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works.  By contrast,
+the GNU General Public License is intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users.  We, the Free Software Foundation, use the
+GNU General Public License for most of our software; it applies also to
+any other work released this way by its authors.  You can apply it to
+your programs, too.
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+  To protect your rights, we need to prevent others from denying you
+these rights or asking you to surrender the rights.  Therefore, you have
+certain responsibilities if you distribute copies of the software, or if
+you modify it: responsibilities to respect the freedom of others.
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must pass on to the recipients the same
+freedoms that you received.  You must make sure that they, too, receive
+or can get the source code.  And you must show them these terms so they
+know their rights.
+  Developers that use the GNU GPL protect your rights with two steps:
+(1) assert copyright on the software, and (2) offer you this License
+giving you legal permission to copy, distribute and/or modify it.
+  For the developers' and authors' protection, the GPL clearly explains
+that there is no warranty for this free software.  For both users' and
+authors' sake, the GPL requires that modified versions be marked as
+changed, so that their problems will not be attributed erroneously to
+authors of previous versions.
+  Some devices are designed to deny users access to install or run
+modified versions of the software inside them, although the manufacturer
+can do so.  This is fundamentally incompatible with the aim of
+protecting users' freedom to change the software.  The systematic
+pattern of such abuse occurs in the area of products for individuals to
+use, which is precisely where it is most unacceptable.  Therefore, we
+have designed this version of the GPL to prohibit the practice for those
+products.  If such problems arise substantially in other domains, we
+stand ready to extend this provision to those domains in future versions
+of the GPL, as needed to protect the freedom of users.
+  Finally, every program is threatened constantly by software patents.
+States should not allow patents to restrict development and use of
+software on general-purpose computers, but in those that do, we wish to
+avoid the special danger that patents applied to a free program could
+make it effectively proprietary.  To prevent this, the GPL assures that
+patents cannot be used to render the program non-free.
+  The precise terms and conditions for copying, distribution and
+modification follow.
+                       TERMS AND CONDITIONS
+  0. Definitions.
+  "This License" refers to version 3 of the GNU General Public License.
+  "Copyright" also means copyright-like laws that apply to other kinds of
+works, such as semiconductor masks.
+  "The Program" refers to any copyrightable work licensed under this
+License.  Each licensee is addressed as "you".  "Licensees" and
+"recipients" may be individuals or organizations.
+  To "modify" a work means to copy from or adapt all or part of the work
+in a fashion requiring copyright permission, other than the making of an
+exact copy.  The resulting work is called a "modified version" of the
+earlier work or a work "based on" the earlier work.
+  A "covered work" means either the unmodified Program or a work based
+on the Program.
+  To "propagate" a work means to do anything with it that, without
+permission, would make you directly or secondarily liable for
+infringement under applicable copyright law, except executing it on a
+computer or modifying a private copy.  Propagation includes copying,
+distribution (with or without modification), making available to the
+public, and in some countries other activities as well.
+  To "convey" a work means any kind of propagation that enables other
+parties to make or receive copies.  Mere interaction with a user through
+a computer network, with no transfer of a copy, is not conveying.
+  An interactive user interface displays "Appropriate Legal Notices"
+to the extent that it includes a convenient and prominently visible
+feature that (1) displays an appropriate copyright notice, and (2)
+tells the user that there is no warranty for the work (except to the
+extent that warranties are provided), that licensees may convey the
+work under this License, and how to view a copy of this License.  If
+the interface presents a list of user commands or options, such as a
+menu, a prominent item in the list meets this criterion.
+  1. Source Code.
+  The "source code" for a work means the preferred form of the work
+for making modifications to it.  "Object code" means any non-source
+form of a work.
+  A "Standard Interface" means an interface that either is an official
+standard defined by a recognized standards body, or, in the case of
+interfaces specified for a particular programming language, one that
+is widely used among developers working in that language.
+  The "System Libraries" of an executable work include anything, other
+than the work as a whole, that (a) is included in the normal form of
+packaging a Major Component, but which is not part of that Major
+Component, and (b) serves only to enable use of the work with that
+Major Component, or to implement a Standard Interface for which an
+implementation is available to the public in source code form.  A
+"Major Component", in this context, means a major essential component
+(kernel, window system, and so on) of the specific operating system
+(if any) on which the executable work runs, or a compiler used to
+produce the work, or an object code interpreter used to run it.
+  The "Corresponding Source" for a work in object code form means all
+the source code needed to generate, install, and (for an executable
+work) run the object code and to modify the work, including scripts to
+control those activities.  However, it does not include the work's
+System Libraries, or general-purpose tools or generally available free
+programs which are used unmodified in performing those activities but
+which are not part of the work.  For example, Corresponding Source
+includes interface definition files associated with source files for
+the work, and the source code for shared libraries and dynamically
+linked subprograms that the work is specifically designed to require,
+such as by intimate data communication or control flow between those
+subprograms and other parts of the work.
+  The Corresponding Source need not include anything that users
+can regenerate automatically from other parts of the Corresponding
+Source.
+  The Corresponding Source for a work in source code form is that
+same work.
+  2. Basic Permissions.
+  All rights granted under this License are granted for the term of
+copyright on the Program, and are irrevocable provided the stated
+conditions are met.  This License explicitly affirms your unlimited
+permission to run the unmodified Program.  The output from running a
+covered work is covered by this License only if the output, given its
+content, constitutes a covered work.  This License acknowledges your
+rights of fair use or other equivalent, as provided by copyright law.
+  You may make, run and propagate covered works that you do not
+convey, without conditions so long as your license otherwise remains
+in force.  You may convey covered works to others for the sole purpose
+of having them make modifications exclusively for you, or provide you
+with facilities for running those works, provided that you comply with
+the terms of this License in conveying all material for which you do
+not control copyright.  Those thus making or running the covered works
+for you must do so exclusively on your behalf, under your direction
+and control, on terms that prohibit them from making any copies of
+your copyrighted material outside their relationship with you.
+  Conveying under any other circumstances is permitted solely under
+the conditions stated below.  Sublicensing is not allowed; section 10
+makes it unnecessary.
+  3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+  No covered work shall be deemed part of an effective technological
+measure under any applicable law fulfilling obligations under article
+11 of the WIPO copyright treaty adopted on 20 December 1996, or
+similar laws prohibiting or restricting circumvention of such
+measures.
+  When you convey a covered work, you waive any legal power to forbid
+circumvention of technological measures to the extent such circumvention
+is effected by exercising rights under this License with respect to
+the covered work, and you disclaim any intention to limit operation or
+modification of the work as a means of enforcing, against the work's
+users, your or third parties' legal rights to forbid circumvention of
+technological measures.
+  4. Conveying Verbatim Copies.
+  You may convey verbatim copies of the Program's source code as you
+receive it, in any medium, provided that you conspicuously and
+appropriately publish on each copy an appropriate copyright notice;
+keep intact all notices stating that this License and any
+non-permissive terms added in accord with section 7 apply to the code;
+keep intact all notices of the absence of any warranty; and give all
+recipients a copy of this License along with the Program.
+  You may charge any price or no price for each copy that you convey,
+and you may offer support or warranty protection for a fee.
+  5. Conveying Modified Source Versions.
+  You may convey a work based on the Program, or the modifications to
+produce it from the Program, in the form of source code under the
+terms of section 4, provided that you also meet all of these conditions:
+    a) The work must carry prominent notices stating that you modified
+    it, and giving a relevant date.
+    b) The work must carry prominent notices stating that it is
+    released under this License and any conditions added under section
+    7.  This requirement modifies the requirement in section 4 to
+    "keep intact all notices".
+    c) You must license the entire work, as a whole, under this
+    License to anyone who comes into possession of a copy.  This
+    License will therefore apply, along with any applicable section 7
+    additional terms, to the whole of the work, and all its parts,
+    regardless of how they are packaged.  This License gives no
+    permission to license the work in any other way, but it does not
+    invalidate such permission if you have separately received it.
+    d) If the work has interactive user interfaces, each must display
+    Appropriate Legal Notices; however, if the Program has interactive
+    interfaces that do not display Appropriate Legal Notices, your
+    work need not make them do so.
+  A compilation of a covered work with other separate and independent
+works, which are not by their nature extensions of the covered work,
+and which are not combined with it such as to form a larger program,
+in or on a volume of a storage or distribution medium, is called an
+"aggregate" if the compilation and its resulting copyright are not
+used to limit the access or legal rights of the compilation's users
+beyond what the individual works permit.  Inclusion of a covered work
+in an aggregate does not cause this License to apply to the other
+parts of the aggregate.
+  6. Conveying Non-Source Forms.
+  You may convey a covered work in object code form under the terms
+of sections 4 and 5, provided that you also convey the
+machine-readable Corresponding Source under the terms of this License,
+in one of these ways:
+    a) Convey the object code in, or embodied in, a physical product
+    (including a physical distribution medium), accompanied by the
+    Corresponding Source fixed on a durable physical medium
+    customarily used for software interchange.
+    b) Convey the object code in, or embodied in, a physical product
+    (including a physical distribution medium), accompanied by a
+    written offer, valid for at least three years and valid for as
+    long as you offer spare parts or customer support for that product
+    model, to give anyone who possesses the object code either (1) a
+    copy of the Corresponding Source for all the software in the
+    product that is covered by this License, on a durable physical
+    medium customarily used for software interchange, for a price no
+    more than your reasonable cost of physically performing this
+    conveying of source, or (2) access to copy the
+    Corresponding Source from a network server at no charge.
+    c) Convey individual copies of the object code with a copy of the
+    written offer to provide the Corresponding Source.  This
+    alternative is allowed only occasionally and noncommercially, and
+    only if you received the object code with such an offer, in accord
+    with subsection 6b.
+    d) Convey the object code by offering access from a designated
+    place (gratis or for a charge), and offer equivalent access to the
+    Corresponding Source in the same way through the same place at no
+    further charge.  You need not require recipients to copy the
+    Corresponding Source along with the object code.  If the place to
+    copy the object code is a network server, the Corresponding Source
+    may be on a different server (operated by you or a third party)
+    that supports equivalent copying facilities, provided you maintain
+    clear directions next to the object code saying where to find the
+    Corresponding Source.  Regardless of what server hosts the
+    Corresponding Source, you remain obligated to ensure that it is
+    available for as long as needed to satisfy these requirements.
+    e) Convey the object code using peer-to-peer transmission, provided
+    you inform other peers where the object code and Corresponding
+    Source of the work are being offered to the general public at no
+    charge under subsection 6d.
+  A separable portion of the object code, whose source code is excluded
+from the Corresponding Source as a System Library, need not be
+included in conveying the object code work.
+  A "User Product" is either (1) a "consumer product", which means any
+tangible personal property which is normally used for personal, family,
+or household purposes, or (2) anything designed or sold for incorporation
+into a dwelling.  In determining whether a product is a consumer product,
+doubtful cases shall be resolved in favor of coverage.  For a particular
+product received by a particular user, "normally used" refers to a
+typical or common use of that class of product, regardless of the status
+of the particular user or of the way in which the particular user
+actually uses, or expects or is expected to use, the product.  A product
+is a consumer product regardless of whether the product has substantial
+commercial, industrial or non-consumer uses, unless such uses represent
+the only significant mode of use of the product.
+  "Installation Information" for a User Product means any methods,
+procedures, authorization keys, or other information required to install
+and execute modified versions of a covered work in that User Product from
+a modified version of its Corresponding Source.  The information must
+suffice to ensure that the continued functioning of the modified object
+code is in no case prevented or interfered with solely because
+modification has been made.
+  If you convey an object code work under this section in, or with, or
+specifically for use in, a User Product, and the conveying occurs as
+part of a transaction in which the right of possession and use of the
+User Product is transferred to the recipient in perpetuity or for a
+fixed term (regardless of how the transaction is characterized), the
+Corresponding Source conveyed under this section must be accompanied
+by the Installation Information.  But this requirement does not apply
+if neither you nor any third party retains the ability to install
+modified object code on the User Product (for example, the work has
+been installed in ROM).
+  The requirement to provide Installation Information does not include a
+requirement to continue to provide support service, warranty, or updates
+for a work that has been modified or installed by the recipient, or for
+the User Product in which it has been modified or installed.  Access to a
+network may be denied when the modification itself materially and
+adversely affects the operation of the network or violates the rules and
+protocols for communication across the network.
+  Corresponding Source conveyed, and Installation Information provided,
+in accord with this section must be in a format that is publicly
+documented (and with an implementation available to the public in
+source code form), and must require no special password or key for
+unpacking, reading or copying.
+  7. Additional Terms.
+  "Additional permissions" are terms that supplement the terms of this
+License by making exceptions from one or more of its conditions.
+Additional permissions that are applicable to the entire Program shall
+be treated as though they were included in this License, to the extent
+that they are valid under applicable law.  If additional permissions
+apply only to part of the Program, that part may be used separately
+under those permissions, but the entire Program remains governed by
+this License without regard to the additional permissions.
+  When you convey a copy of a covered work, you may at your option
+remove any additional permissions from that copy, or from any part of
+it.  (Additional permissions may be written to require their own
+removal in certain cases when you modify the work.)  You may place
+additional permissions on material, added by you to a covered work,
+for which you have or can give appropriate copyright permission.
+  Notwithstanding any other provision of this License, for material you
+add to a covered work, you may (if authorized by the copyright holders of
+that material) supplement the terms of this License with terms:
+    a) Disclaiming warranty or limiting liability differently from the
+    terms of sections 15 and 16 of this License; or
+    b) Requiring preservation of specified reasonable legal notices or
+    author attributions in that material or in the Appropriate Legal
+    Notices displayed by works containing it; or
+    c) Prohibiting misrepresentation of the origin of that material, or
+    requiring that modified versions of such material be marked in
+    reasonable ways as different from the original version; or
+    d) Limiting the use for publicity purposes of names of licensors or
+    authors of the material; or
+    e) Declining to grant rights under trademark law for use of some
+    trade names, trademarks, or service marks; or
+    f) Requiring indemnification of licensors and authors of that
+    material by anyone who conveys the material (or modified versions of
+    it) with contractual assumptions of liability to the recipient, for
+    any liability that these contractual assumptions directly impose on
+    those licensors and authors.
+  All other non-permissive additional terms are considered "further
+restrictions" within the meaning of section 10.  If the Program as you
+received it, or any part of it, contains a notice stating that it is
+governed by this License along with a term that is a further
+restriction, you may remove that term.  If a license document contains
+a further restriction but permits relicensing or conveying under this
+License, you may add to a covered work material governed by the terms
+of that license document, provided that the further restriction does
+not survive such relicensing or conveying.
+  If you add terms to a covered work in accord with this section, you
+must place, in the relevant source files, a statement of the
+additional terms that apply to those files, or a notice indicating
+where to find the applicable terms.
+  Additional terms, permissive or non-permissive, may be stated in the
+form of a separately written license, or stated as exceptions;
+the above requirements apply either way.
+  8. Termination.
+  You may not propagate or modify a covered work except as expressly
+provided under this License.  Any attempt otherwise to propagate or
+modify it is void, and will automatically terminate your rights under
+this License (including any patent licenses granted under the third
+paragraph of section 11).
+  However, if you cease all violation of this License, then your
+license from a particular copyright holder is reinstated (a)
+provisionally, unless and until the copyright holder explicitly and
+finally terminates your license, and (b) permanently, if the copyright
+holder fails to notify you of the violation by some reasonable means
+prior to 60 days after the cessation.
+  Moreover, your license from a particular copyright holder is
+reinstated permanently if the copyright holder notifies you of the
+violation by some reasonable means, this is the first time you have
+received notice of violation of this License (for any work) from that
+copyright holder, and you cure the violation prior to 30 days after
+your receipt of the notice.
+  Termination of your rights under this section does not terminate the
+licenses of parties who have received copies or rights from you under
+this License.  If your rights have been terminated and not permanently
+reinstated, you do not qualify to receive new licenses for the same
+material under section 10.
+  9. Acceptance Not Required for Having Copies.
+  You are not required to accept this License in order to receive or
+run a copy of the Program.  Ancillary propagation of a covered work
+occurring solely as a consequence of using peer-to-peer transmission
+to receive a copy likewise does not require acceptance.  However,
+nothing other than this License grants you permission to propagate or
+modify any covered work.  These actions infringe copyright if you do
+not accept this License.  Therefore, by modifying or propagating a
+covered work, you indicate your acceptance of this License to do so.
+  10. Automatic Licensing of Downstream Recipients.
+  Each time you convey a covered work, the recipient automatically
+receives a license from the original licensors, to run, modify and
+propagate that work, subject to this License.  You are not responsible
+for enforcing compliance by third parties with this License.
+  An "entity transaction" is a transaction transferring control of an
+organization, or substantially all assets of one, or subdividing an
+organization, or merging organizations.  If propagation of a covered
+work results from an entity transaction, each party to that
+transaction who receives a copy of the work also receives whatever
+licenses to the work the party's predecessor in interest had or could
+give under the previous paragraph, plus a right to possession of the
+Corresponding Source of the work from the predecessor in interest, if
+the predecessor has it or can get it with reasonable efforts.
+  You may not impose any further restrictions on the exercise of the
+rights granted or affirmed under this License.  For example, you may
+not impose a license fee, royalty, or other charge for exercise of
+rights granted under this License, and you may not initiate litigation
+(including a cross-claim or counterclaim in a lawsuit) alleging that
+any patent claim is infringed by making, using, selling, offering for
+sale, or importing the Program or any portion of it.
+  11. Patents.
+  A "contributor" is a copyright holder who authorizes use under this
+License of the Program or a work on which the Program is based.  The
+work thus licensed is called the contributor's "contributor version".
+  A contributor's "essential patent claims" are all patent claims
+owned or controlled by the contributor, whether already acquired or
+hereafter acquired, that would be infringed by some manner, permitted
+by this License, of making, using, or selling its contributor version,
+but do not include claims that would be infringed only as a
+consequence of further modification of the contributor version.  For
+purposes of this definition, "control" includes the right to grant
+patent sublicenses in a manner consistent with the requirements of
+this License.
+  Each contributor grants you a non-exclusive, worldwide, royalty-free
+patent license under the contributor's essential patent claims, to
+make, use, sell, offer for sale, import and otherwise run, modify and
+propagate the contents of its contributor version.
+  In the following three paragraphs, a "patent license" is any express
+agreement or commitment, however denominated, not to enforce a patent
+(such as an express permission to practice a patent or covenant not to
+sue for patent infringement).  To "grant" such a patent license to a
+party means to make such an agreement or commitment not to enforce a
+patent against the party.
+  If you convey a covered work, knowingly relying on a patent license,
+and the Corresponding Source of the work is not available for anyone
+to copy, free of charge and under the terms of this License, through a
+publicly available network server or other readily accessible means,
+then you must either (1) cause the Corresponding Source to be so
+available, or (2) arrange to deprive yourself of the benefit of the
+patent license for this particular work, or (3) arrange, in a manner
+consistent with the requirements of this License, to extend the patent
+license to downstream recipients.  "Knowingly relying" means you have
+actual knowledge that, but for the patent license, your conveying the
+covered work in a country, or your recipient's use of the covered work
+in a country, would infringe one or more identifiable patents in that
+country that you have reason to believe are valid.
+  If, pursuant to or in connection with a single transaction or
+arrangement, you convey, or propagate by procuring conveyance of, a
+covered work, and grant a patent license to some of the parties
+receiving the covered work authorizing them to use, propagate, modify
+or convey a specific copy of the covered work, then the patent license
+you grant is automatically extended to all recipients of the covered
+work and works based on it.
+  A patent license is "discriminatory" if it does not include within
+the scope of its coverage, prohibits the exercise of, or is
+conditioned on the non-exercise of one or more of the rights that are
+specifically granted under this License.  You may not convey a covered
+work if you are a party to an arrangement with a third party that is
+in the business of distributing software, under which you make payment
+to the third party based on the extent of your activity of conveying
+the work, and under which the third party grants, to any of the
+parties who would receive the covered work from you, a discriminatory
+patent license (a) in connection with copies of the covered work
+conveyed by you (or copies made from those copies), or (b) primarily
+for and in connection with specific products or compilations that
+contain the covered work, unless you entered into that arrangement,
+or that patent license was granted, prior to 28 March 2007.
+  Nothing in this License shall be construed as excluding or limiting
+any implied license or other defenses to infringement that may
+otherwise be available to you under applicable patent law.
+  12. No Surrender of Others' Freedom.
+  If conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot convey a
+covered work so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you may
+not convey it at all.  For example, if you agree to terms that obligate you
+to collect a royalty for further conveying from those to whom you convey
+the Program, the only way you could satisfy both those terms and this
+License would be to refrain entirely from conveying the Program.
+  13. Use with the GNU Affero General Public License.
+  Notwithstanding any other provision of this License, you have
+permission to link or combine any covered work with a work licensed
+under version 3 of the GNU Affero General Public License into a single
+combined work, and to convey the resulting work.  The terms of this
+License will continue to apply to the part which is the covered work,
+but the special requirements of the GNU Affero General Public License,
+section 13, concerning interaction through a network will apply to the
+combination as such.
+  14. Revised Versions of this License.
+  The Free Software Foundation may publish revised and/or new versions of
+the GNU General Public License from time to time.  Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+  Each version is given a distinguishing version number.  If the
+Program specifies that a certain numbered version of the GNU General
+Public License "or any later version" applies to it, you have the
+option of following the terms and conditions either of that numbered
+version or of any later version published by the Free Software
+Foundation.  If the Program does not specify a version number of the
+GNU General Public License, you may choose any version ever published
+by the Free Software Foundation.
+  If the Program specifies that a proxy can decide which future
+versions of the GNU General Public License can be used, that proxy's
+public statement of acceptance of a version permanently authorizes you
+to choose that version for the Program.
+  Later license versions may give you additional or different
+permissions.  However, no additional obligations are imposed on any
+author or copyright holder as a result of your choosing to follow a
+later version.
+  15. Disclaimer of Warranty.
+  THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
+APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
+HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
+OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
+IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
+ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+  16. Limitation of Liability.
+  IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
+THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
+GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
+USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
+DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
+PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
+EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.
+  17. Interpretation of Sections 15 and 16.
+  If the disclaimer of warranty and limitation of liability provided
+above cannot be given local legal effect according to their terms,
+reviewing courts shall apply local law that most closely approximates
+an absolute waiver of all civil liability in connection with the
+Program, unless a warranty or assumption of liability accompanies a
+copy of the Program in return for a fee.
+                     END OF TERMS AND CONDITIONS
+            How to Apply These Terms to Your New Programs
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+  To do so, attach the following notices to the program.  It is safest
+to attach them to the start of each source file to most effectively
+state the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <https://www.gnu.org/licenses/>.
+Also add information on how to contact you by electronic and paper mail.
+  If the program does terminal interaction, make it output a short
+notice like this when it starts in an interactive mode:
+    <program>  Copyright (C) <year>  <name of author>
+    This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, your program's commands
+might be different; for a GUI interface, you would use an "about box".
+  You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU GPL, see
+<https://www.gnu.org/licenses/>.
+  The GNU General Public License does not permit incorporating your program
+into proprietary programs.  If your program is a subroutine library, you
+may consider it more useful to permit linking proprietary applications with
+the library.  If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.  But first, please read
+<https://www.gnu.org/licenses/why-not-lgpl.html>.

models/icpr2020dfdc/README.md ADDED Viewed

	@@ -0,0 +1,120 @@

+# Video Face Manipulation Detection Through Ensemble of CNNs
+[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/video-face-manipulation-detection-through/deepfake-detection-on-dfdc)](https://paperswithcode.com/sota/deepfake-detection-on-dfdc?p=video-face-manipulation-detection-through)
+[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/video-face-manipulation-detection-through/deepfake-detection-on-faceforensics-1)](https://paperswithcode.com/sota/deepfake-detection-on-faceforensics-1?p=video-face-manipulation-detection-through)
+[![Build Status](https://travis-ci.org/polimi-ispl/icpr2020dfdc.svg?branch=master)](https://travis-ci.org/polimi-ispl/icpr2020dfdc)
+![](assets/faces_attention.png)
+<p align='center'>
+  <img src='assets/mqzvfufzoq_face.gif'/>
+  <img src='assets/mqzvfufzoq_face_att.gif'/>
+</p>
+This is the official repository of **Video Face Manipulation Detection Through Ensemble of CNNs**,
+presented at [ICPR2020](https://www.micc.unifi.it/icpr2020/) and currently available on [IEEExplore](https://ieeexplore.ieee.org/document/9412711) and [arXiv](https://arxiv.org/abs/2004.07676).
+If you use this repository for your research, please consider citing our paper. Refer to [How to cite](https://github.com/polimi-ispl/icpr2020dfdc#how-to-cite) section to get the correct entry for your bibliography.
+We participated as the **ISPL** team in the [Kaggle Deepfake Detection Challenge](https://www.kaggle.com/c/deepfake-detection-challenge/).
+With this implementation, we reached the 41st position out of 2116 teams (**top 2%**) on the [private leaderboard](https://www.kaggle.com/c/deepfake-detection-challenge/leaderboard).
+This repository is currently under maintenance; if you experience any problems, please open an [issue](https://github.com/polimi-ispl/icpr2020dfdc/issues).
+## Getting started
+### Prerequisites
+- Install [conda](https://docs.conda.io/en/latest/miniconda.html)
+- Create the `icpr2020` environment with *environment.yml*
+```bash
+$ conda env create -f environment.yml
+$ conda activate icpr2020
+```
+- Download and unzip the [datasets](#datasets)
+### Quick run
+If you just want to test the pre-trained models against your own videos or images:
+- [Video prediction notebook](https://github.com/polimi-ispl/icpr2020dfdc/blob/master/notebook/Video%20prediction.ipynb) <a target="_blank" href="https://colab.research.google.com/drive/12WnvmerHBNbJ49HdoH1lli_O8SwaFPjv?usp=sharing">
+  <img src="https://colab.research.google.com/assets/colab-badge.svg">
+</a>
+- [Image prediction notebook](https://github.com/polimi-ispl/icpr2020dfdc/blob/master/notebook/Image%20prediction.ipynb) <a target="_blank" href="https://colab.research.google.com/drive/19oVKlzEr58VZfRnSq-nW8kFYuxkh3GM8?usp=sharing">
+  <img src="https://colab.research.google.com/assets/colab-badge.svg">
+</a>
+- [Image prediction with attention](notebook/Image%20prediction%20and%20attention.ipynb) <a target="_blank" href="https://colab.research.google.com/drive/1zcglis2Qx2vtJhrogn8aKA-mbUotLZLK?usp=sharing">
+  <img src="https://colab.research.google.com/assets/colab-badge.svg">
+</a>
+### The whole pipeline
+You need to preprocess the datasets in order to index all the samples and extract faces. Just run the script [make_dataset.sh](scripts/make_dataset.sh)
+```bash
+$ ./scripts/make_dataset.sh
+```
+Please note that we use only 32 frames per video. You can easily tweak this parameter in [extract_faces.py](extract_faces.py)
+Also, please note that **for the DFDC** we have resorted to _the training split_ exclusively!
+In `scripts/make_dataset.sh` the value of `DFDC_SRC` should point to the directory containing the DFDC train split.
+### Celeb-DF (v2)
+Although **we did not use this dataset in the paper**, we provide a script [index_celebdf.py](index_celebdf.py) to index the videos similarly to
+DFDC and FF++. Once you have the index, you can proceed with the pipeline starting from [extract_faces.py](extract_faces.py). You can also use the
+split `celebdf` during training/testing.
+### Train
+In [train_all.sh](scripts/train_all.sh) you can find a comprehensive list of all the commands to train the models presented in the paper.
+Please refer to the comments in the script for hints on their usage.
+#### Training a single model
+If you want to train some models without launching the script:
+- for the **non-siamese** architectures (e.g. EfficientNetB4, EfficientNetB4Att), you can simply specify the model in [train_binclass.py](train_binclass.py) with the *--net* parameter;
+- for the **siamese** architectures (e.g. EfficientNetB4ST, EfficientNetB4AttST), you have to:
+  1. train the architecture as a feature extractor first, using the [train_triplet.py](train_triplet.py) script and being careful to specify its name with the *--net* parameter **without** the ST suffix. For instance, for training the EfficientNetB4ST you will have to first run `python train_triplet.py --net EfficientNetB4 --otherparams`;
+  2. finetune the model using [train_binclass.py](train_binclass.py), being careful this time to specify the architecture's name **with** the ST suffix and to insert as *--init* argument the path to the weights of the feature extractor trained at the previous step. You will end up running something like `python train_binclass.py --net EfficientNetB4ST --init path/to/EfficientNetB4/weights/trained/with/train_triplet/weights.pth --otherparams`
+### Test
+In [test_all.sh](scripts/test_all.sh) you can find a comprehensive list of all the commands for testing the models presented in the paper.
+#### Pretrained weights
+We also provide pretrained weights for all the architectures presented in the paper.
+Please refer to this [Dropbox link](https://www.dropbox.com/sh/cesamx5ytd5j08c/AADG_eEmhskliMaT0Gbk-yHDa?dl=0).
+Each directory is named `$NETWORK_$DATASET` where `$NETWORK` is the architecture name and `$DATASET` is the training dataset.
+In each directory, you can find `bestval.pth` which are the best network weights according to the validation set.
+Additionally, you can find Jupyter notebooks for results computations in the [notebook](notebook) folder.
+## Datasets
+- [Facebook's DeepFake Detection Challenge (DFDC) train dataset](https://www.kaggle.com/c/deepfake-detection-challenge/data) | [arXiv paper](https://arxiv.org/abs/2006.07397)
+- [FaceForensics++](https://github.com/ondyari/FaceForensics/blob/master/dataset/README.md) | [arXiv paper](https://arxiv.org/abs/1901.08971)
+- [Celeb-DF (v2)](http://www.cs.albany.edu/~lsw/celeb-deepfakeforensics.html) | [arXiv paper](https://arxiv.org/abs/1909.12962) (**Just for reference, not used in the paper**)
+## References
+- [EfficientNet PyTorch](https://github.com/lukemelas/EfficientNet-PyTorch)
+- [Xception PyTorch](https://github.com/tstandley/Xception-PyTorch)
+## How to cite
+Plain text:
+```
+N. Bonettini, E. D. Cannas, S. Mandelli, L. Bondi, P. Bestagini and S. Tubaro, "Video Face Manipulation Detection Through Ensemble of CNNs," 2020 25th International Conference on Pattern Recognition (ICPR), 2021, pp. 5012-5019, doi: 10.1109/ICPR48806.2021.9412711.
+```
+Bibtex:
+```bibtex
+@INPROCEEDINGS{9412711,
+  author={Bonettini, Nicolò and Cannas, Edoardo Daniele and Mandelli, Sara and Bondi, Luca and Bestagini, Paolo and Tubaro, Stefano},
+  booktitle={2020 25th International Conference on Pattern Recognition (ICPR)},
+  title={Video Face Manipulation Detection Through Ensemble of CNNs},
+  year={2021},
+  volume={},
+  number={},
+  pages={5012-5019},
+  doi={10.1109/ICPR48806.2021.9412711}}
+```
+## Credits
+[Image and Sound Processing Lab - Politecnico di Milano](http://ispl.deib.polimi.it/)
+- Nicolò Bonettini
+- Edoardo Daniele Cannas
+- Sara Mandelli
+- Luca Bondi
+- Paolo Bestagini

models/icpr2020dfdc/architectures/__init__.py ADDED Viewed

File without changes

models/icpr2020dfdc/architectures/externals/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ from .xception import xception

models/icpr2020dfdc/architectures/externals/xception.py ADDED Viewed

	@@ -0,0 +1,236 @@

+"""
+Ported to pytorch thanks to [tstandley](https://github.com/tstandley/Xception-PyTorch)
+@author: tstandley
+Adapted by cadene
+Creates an Xception Model as defined in:
+Francois Chollet
+Xception: Deep Learning with Depthwise Separable Convolutions
+https://arxiv.org/pdf/1610.02357.pdf
+These weights were ported from the Keras implementation. They achieve the following performance on the validation set:
+Loss:0.9173 Prec@1:78.892 Prec@5:94.292
+REMEMBER to set your image size to 3x299x299 for both test and validation
+normalize = transforms.Normalize(mean=[0.5, 0.5, 0.5],
+                                  std=[0.5, 0.5, 0.5])
+The resize parameter of the validation transform should be 333, and make sure to center crop at 299x299
+"""
+from __future__ import print_function, division, absolute_import
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.model_zoo as model_zoo
+__all__ = ['xception']
+# Settings of the available pretrained checkpoints, indexed first by architecture
+# name and then by pretraining identifier (only 'xception' / 'imagenet' is defined).
+# The xception() factory reads 'url' and 'num_classes' from an entry and copies
+# 'input_space', 'input_size', 'input_range', 'mean' and 'std' onto the model.
+pretrained_settings = {
+    'xception': {
+        'imagenet': {
+            'url': 'http://data.lip6.fr/cadene/pretrainedmodels/xception-43020ad28.pth',
+            'input_space': 'RGB',
+            'input_size': [3, 299, 299],
+            'input_range': [0, 1],
+            'mean': [0.5, 0.5, 0.5],
+            'std': [0.5, 0.5, 0.5],
+            'num_classes': 1000,
+            'scale': 0.8975
+            # The resize parameter of the validation transform should be 333, and make sure to center crop at 299x299
+        }
+    }
+}
+class SeparableConv2d(nn.Module):
+    def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, padding=0, dilation=1, bias=False):
+        super(SeparableConv2d, self).__init__()
+        self.conv1 = nn.Conv2d(in_channels, in_channels, kernel_size, stride, padding, dilation, groups=in_channels,
+                               bias=bias)
+        self.pointwise = nn.Conv2d(in_channels, out_channels, 1, 1, 0, 1, 1, bias=bias)
+    def forward(self, x):
+        x = self.conv1(x)
+        x = self.pointwise(x)
+        return x
+class Block(nn.Module):
+    """
+    Residual block: a stack of ReLU -> SeparableConv2d -> BatchNorm2d triplets
+    (`rep`) whose output is summed with a skip connection.
+    The position of every module inside `rep` determines its index in the
+    nn.Sequential, and therefore its state_dict key.
+    """
+    def __init__(self, in_filters, out_filters, reps, strides=1, start_with_relu=True, grow_first=True):
+        """
+        Args:
+            in_filters: number of input channels
+            out_filters: number of output channels
+            reps: number of ReLU/SeparableConv2d/BatchNorm2d triplets
+            strides: stride of the skip convolution and of the final max pooling
+            start_with_relu: if False, the leading ReLU is dropped from the stack
+            grow_first: if True the channel change happens in the first triplet,
+                otherwise in the last one
+        """
+        super(Block, self).__init__()
+        # A 1x1 convolution + batch norm on the skip path is needed only when the
+        # main path changes the number of channels or the spatial resolution.
+        if out_filters != in_filters or strides != 1:
+            self.skip = nn.Conv2d(in_filters, out_filters, 1, stride=strides, bias=False)
+            self.skipbn = nn.BatchNorm2d(out_filters)
+        else:
+            self.skip = None
+        rep = []
+        # Channel count used by the intermediate (non-growing) triplets.
+        filters = in_filters
+        if grow_first:
+            rep.append(nn.ReLU(inplace=True))
+            rep.append(SeparableConv2d(in_filters, out_filters, 3, stride=1, padding=1, bias=False))
+            rep.append(nn.BatchNorm2d(out_filters))
+            filters = out_filters
+        # Remaining triplets keep the channel count unchanged.
+        for i in range(reps - 1):
+            rep.append(nn.ReLU(inplace=True))
+            rep.append(SeparableConv2d(filters, filters, 3, stride=1, padding=1, bias=False))
+            rep.append(nn.BatchNorm2d(filters))
+        if not grow_first:
+            rep.append(nn.ReLU(inplace=True))
+            rep.append(SeparableConv2d(in_filters, out_filters, 3, stride=1, padding=1, bias=False))
+            rep.append(nn.BatchNorm2d(out_filters))
+        if not start_with_relu:
+            # Drop the leading ReLU; the remaining modules shift down by one index.
+            rep = rep[1:]
+        else:
+            # Swap the leading ReLU for a non-in-place one; presumably so that the
+            # block input, which forward() also feeds to the skip path, is not
+            # overwritten -- TODO confirm.
+            rep[0] = nn.ReLU(inplace=False)
+        if strides != 1:
+            rep.append(nn.MaxPool2d(3, strides, 1))
+        self.rep = nn.Sequential(*rep)
+    def forward(self, inp):
+        """Return `rep(inp)` summed with the (optionally projected) input."""
+        x = self.rep(inp)
+        if self.skip is not None:
+            skip = self.skip(inp)
+            skip = self.skipbn(skip)
+        else:
+            skip = inp
+        # In-place accumulation of the skip connection.
+        x += skip
+        return x
+class Xception(nn.Module):
+    """
+    Xception optimized for the ImageNet dataset, as specified in
+    https://arxiv.org/pdf/1610.02357.pdf
+    """
+    def __init__(self, num_classes=1000):
+        """ Constructor
+        Args:
+            num_classes: number of classes
+        """
+        super(Xception, self).__init__()
+        self.num_classes = num_classes
+        self.conv1 = nn.Conv2d(3, 32, 3, 2, 0, bias=False)
+        self.bn1 = nn.BatchNorm2d(32)
+        self.relu1 = nn.ReLU(inplace=True)
+        self.conv2 = nn.Conv2d(32, 64, 3, bias=False)
+        self.bn2 = nn.BatchNorm2d(64)
+        self.relu2 = nn.ReLU(inplace=True)
+        # do relu here
+        self.block1 = Block(64, 128, 2, 2, start_with_relu=False, grow_first=True)
+        self.block2 = Block(128, 256, 2, 2, start_with_relu=True, grow_first=True)
+        self.block3 = Block(256, 728, 2, 2, start_with_relu=True, grow_first=True)
+        self.block4 = Block(728, 728, 3, 1, start_with_relu=True, grow_first=True)
+        self.block5 = Block(728, 728, 3, 1, start_with_relu=True, grow_first=True)
+        self.block6 = Block(728, 728, 3, 1, start_with_relu=True, grow_first=True)
+        self.block7 = Block(728, 728, 3, 1, start_with_relu=True, grow_first=True)
+        self.block8 = Block(728, 728, 3, 1, start_with_relu=True, grow_first=True)
+        self.block9 = Block(728, 728, 3, 1, start_with_relu=True, grow_first=True)
+        self.block10 = Block(728, 728, 3, 1, start_with_relu=True, grow_first=True)
+        self.block11 = Block(728, 728, 3, 1, start_with_relu=True, grow_first=True)
+        self.block12 = Block(728, 1024, 2, 2, start_with_relu=True, grow_first=False)
+        self.conv3 = SeparableConv2d(1024, 1536, 3, 1, 1)
+        self.bn3 = nn.BatchNorm2d(1536)
+        self.relu3 = nn.ReLU(inplace=True)
+        # do relu here
+        self.conv4 = SeparableConv2d(1536, 2048, 3, 1, 1)
+        self.bn4 = nn.BatchNorm2d(2048)
+        self.fc = nn.Linear(2048, num_classes)
+        # #------- init weights --------
+        # for m in self.modules():
+        #     if isinstance(m, nn.Conv2d):
+        #         n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+        #         m.weight.data.normal_(0, math.sqrt(2. / n))
+        #     elif isinstance(m, nn.BatchNorm2d):
+        #         m.weight.data.fill_(1)
+        #         m.bias.data.zero_()
+        # #-----------------------------
+    def features(self, input):
+        x = self.conv1(input)
+        x = self.bn1(x)
+        x = self.relu1(x)
+        x = self.conv2(x)
+        x = self.bn2(x)
+        x = self.relu2(x)
+        x = self.block1(x)
+        x = self.block2(x)
+        x = self.block3(x)
+        x = self.block4(x)
+        x = self.block5(x)
+        x = self.block6(x)
+        x = self.block7(x)
+        x = self.block8(x)
+        x = self.block9(x)
+        x = self.block10(x)
+        x = self.block11(x)
+        x = self.block12(x)
+        x = self.conv3(x)
+        x = self.bn3(x)
+        x = self.relu3(x)
+        x = self.conv4(x)
+        x = self.bn4(x)
+        return x
+    def logits(self, features):
+        x = nn.ReLU(inplace=True)(features)
+        x = F.adaptive_avg_pool2d(x, (1, 1))
+        x = x.view(x.size(0), -1)
+        x = self.last_linear(x)
+        return x
+    def forward(self, input):
+        x = self.features(input)
+        x = self.logits(x)
+        return x
+def xception(num_classes=1000, pretrained='imagenet'):
+    model = Xception(num_classes=num_classes)
+    if pretrained:
+        settings = pretrained_settings['xception'][pretrained]
+        assert num_classes == settings['num_classes'], \
+            "num_classes should be {}, but is {}".format(settings['num_classes'], num_classes)
+        model = Xception(num_classes=num_classes)
+        model.load_state_dict(model_zoo.load_url(settings['url']))
+        model.input_space = settings['input_space']
+        model.input_size = settings['input_size']
+        model.input_range = settings['input_range']
+        model.mean = settings['mean']
+        model.std = settings['std']
+    # TODO: ugly
+    model.last_linear = model.fc
+    del model.fc
+    return model

models/icpr2020dfdc/architectures/fornet.py ADDED Viewed

	@@ -0,0 +1,245 @@

+"""
+Video Face Manipulation Detection Through Ensemble of CNNs
+Image and Sound Processing Lab - Politecnico di Milano
+Nicolò Bonettini
+Edoardo Daniele Cannas
+Sara Mandelli
+Luca Bondi
+Paolo Bestagini
+"""
+from collections import OrderedDict
+import torch
+from efficientnet_pytorch import EfficientNet
+from torch import nn as nn
+from torch.nn import functional as F
+from torchvision import transforms
+from . import externals
+"""
+Feature Extractor
+"""
+class FeatureExtractor(nn.Module):
+    """
+    Abstract class to be extended when supporting features extraction.
+    It also provides standard normalized and parameters
+    """
+    def features(self, x: torch.Tensor) -> torch.Tensor:
+        raise NotImplementedError
+    def get_trainable_parameters(self):
+        return self.parameters()
+    @staticmethod
+    def get_normalizer():
+        return transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+"""
+EfficientNet
+"""
+class EfficientNetGen(FeatureExtractor):
+    def __init__(self, model: str):
+        super(EfficientNetGen, self).__init__()
+        self.efficientnet = EfficientNet.from_pretrained(model)
+        self.classifier = nn.Linear(self.efficientnet._conv_head.out_channels, 1)
+        del self.efficientnet._fc
+    def features(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.efficientnet.extract_features(x)
+        x = self.efficientnet._avg_pooling(x)
+        x = x.flatten(start_dim=1)
+        return x
+    def forward(self, x):
+        x = self.features(x)
+        x = self.efficientnet._dropout(x)
+        x = self.classifier(x)
+        return x
+class EfficientNetB4(EfficientNetGen):
+    def __init__(self):
+        super(EfficientNetB4, self).__init__(model='efficientnet-b4')
+"""
+EfficientNetAutoAtt
+"""
+class EfficientNetAutoAtt(EfficientNet):
+    def init_att(self, model: str, width: int):
+        """
+        Initialize attention
+        :param model: efficientnet-bx, x \in {0,..,7}
+        :param depth: attention width
+        :return:
+        """
+        if model == 'efficientnet-b4':
+            self.att_block_idx = 9
+            if width == 0:
+                self.attconv = nn.Conv2d(kernel_size=1, in_channels=56, out_channels=1)
+            else:
+                attconv_layers = []
+                for i in range(width):
+                    attconv_layers.append(
+                        ('conv{:d}'.format(i), nn.Conv2d(kernel_size=3, padding=1, in_channels=56, out_channels=56)))
+                    attconv_layers.append(
+                        ('relu{:d}'.format(i), nn.ReLU(inplace=True)))
+                attconv_layers.append(('conv_out', nn.Conv2d(kernel_size=1, in_channels=56, out_channels=1)))
+                self.attconv = nn.Sequential(OrderedDict(attconv_layers))
+        else:
+            raise ValueError('Model not valid: {}'.format(model))
+    def get_attention(self, x: torch.Tensor) -> torch.Tensor:
+        # Placeholder
+        att = None
+        # Stem
+        x = self._swish(self._bn0(self._conv_stem(x)))
+        # Blocks
+        for idx, block in enumerate(self._blocks):
+            drop_connect_rate = self._global_params.drop_connect_rate
+            if drop_connect_rate:
+                drop_connect_rate *= float(idx) / len(self._blocks)
+            x = block(x, drop_connect_rate=drop_connect_rate)
+            if idx == self.att_block_idx:
+                att = torch.sigmoid(self.attconv(x))
+                break
+        return att
+    def extract_features(self, x: torch.Tensor) -> torch.Tensor:
+        # Stem
+        x = self._swish(self._bn0(self._conv_stem(x)))
+        # Blocks
+        for idx, block in enumerate(self._blocks):
+            drop_connect_rate = self._global_params.drop_connect_rate
+            if drop_connect_rate:
+                drop_connect_rate *= float(idx) / len(self._blocks)
+            x = block(x, drop_connect_rate=drop_connect_rate)
+            if idx == self.att_block_idx:
+                att = torch.sigmoid(self.attconv(x))
+                x = x * att
+        # Head
+        x = self._swish(self._bn1(self._conv_head(x)))
+        return x
+class EfficientNetGenAutoAtt(FeatureExtractor):
+    def __init__(self, model: str, width: int):
+        super(EfficientNetGenAutoAtt, self).__init__()
+        self.efficientnet = EfficientNetAutoAtt.from_pretrained(model)
+        self.efficientnet.init_att(model, width)
+        self.classifier = nn.Linear(self.efficientnet._conv_head.out_channels, 1)
+        del self.efficientnet._fc
+    def features(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.efficientnet.extract_features(x)
+        x = self.efficientnet._avg_pooling(x)
+        x = x.flatten(start_dim=1)
+        return x
+    def forward(self, x):
+        x = self.features(x)
+        x = self.efficientnet._dropout(x)
+        x = self.classifier(x)
+        return x
+    def get_attention(self, x: torch.Tensor) -> torch.Tensor:
+        return self.efficientnet.get_attention(x)
+class EfficientNetAutoAttB4(EfficientNetGenAutoAtt):
+    def __init__(self):
+        super(EfficientNetAutoAttB4, self).__init__(model='efficientnet-b4', width=0)
+"""
+Xception
+"""
+class Xception(FeatureExtractor):
+    def __init__(self):
+        super(Xception, self).__init__()
+        self.xception = externals.xception()
+        self.xception.last_linear = nn.Linear(2048, 1)
+    def features(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.xception.features(x)
+        x = nn.ReLU(inplace=True)(x)
+        x = F.adaptive_avg_pool2d(x, (1, 1))
+        x = x.view(x.size(0), -1)
+        return x
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.xception.forward(x)
+"""
+Siamese tuning
+"""
+class SiameseTuning(FeatureExtractor):
+    def __init__(self, feat_ext: FeatureExtractor, num_feat: int, lastonly: bool = True):
+        super(SiameseTuning, self).__init__()
+        self.feat_ext = feat_ext()
+        if not hasattr(self.feat_ext, 'features'):
+            raise NotImplementedError('The provided feature extractor needs to provide a features() method')
+        self.lastonly = lastonly
+        self.classifier = nn.Sequential(
+            nn.BatchNorm1d(num_features=num_feat),
+            nn.Linear(in_features=num_feat, out_features=1),
+        )
+    def features(self, x):
+        x = self.feat_ext.features(x)
+        return x
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        if self.lastonly:
+            with torch.no_grad():
+                x = self.features(x)
+        else:
+            x = self.features(x)
+        x = self.classifier(x)
+        return x
+    def get_trainable_parameters(self):
+        if self.lastonly:
+            return self.classifier.parameters()
+        else:
+            return self.parameters()
+class EfficientNetB4ST(SiameseTuning):
+    def __init__(self):
+        super(EfficientNetB4ST, self).__init__(feat_ext=EfficientNetB4, num_feat=1792, lastonly=True)
+class EfficientNetAutoAttB4ST(SiameseTuning):
+    def __init__(self):
+        super(EfficientNetAutoAttB4ST, self).__init__(feat_ext=EfficientNetAutoAttB4, num_feat=1792, lastonly=True)
+class XceptionST(SiameseTuning):
+    def __init__(self):
+        super(XceptionST, self).__init__(feat_ext=Xception, num_feat=2048, lastonly=True)

models/icpr2020dfdc/architectures/tripletnet.py ADDED Viewed

	@@ -0,0 +1,44 @@

+"""
+Video Face Manipulation Detection Through Ensemble of CNNs
+Image and Sound Processing Lab - Politecnico di Milano
+Nicolò Bonettini
+Edoardo Daniele Cannas
+Sara Mandelli
+Luca Bondi
+Paolo Bestagini
+"""
+from . import fornet
+from .fornet import FeatureExtractor
+class TripletNet(FeatureExtractor):
+    """
+    Template class for triplet net
+    """
+    def __init__(self, feat_ext: FeatureExtractor):
+        super(TripletNet, self).__init__()
+        self.feat_ext = feat_ext()
+        if not hasattr(self.feat_ext, 'features'):
+            raise NotImplementedError('The provided feature extractor needs to provide a features() method')
+    def features(self, x):
+        return self.feat_ext.features(x)
+    def forward(self, x1, x2, x3):
+        x1 = self.features(x1)
+        x2 = self.features(x2)
+        x3 = self.features(x3)
+        return x1, x2, x3
+class EfficientNetB4(TripletNet):
+    def __init__(self):
+        super(EfficientNetB4, self).__init__(feat_ext=fornet.EfficientNetB4)
+class EfficientNetAutoAttB4(TripletNet):
+    def __init__(self):
+        super(EfficientNetAutoAttB4, self).__init__(feat_ext=fornet.EfficientNetAutoAttB4)

models/icpr2020dfdc/architectures/weights.py ADDED Viewed

	@@ -0,0 +1,24 @@

+"""
+Video Face Manipulation Detection Through Ensemble of CNNs
+Image and Sound Processing Lab - Politecnico di Milano
+Nicolò Bonettini
+Edoardo Daniele Cannas
+Sara Mandelli
+Luca Bondi
+Paolo Bestagini
+"""
+# Maps '<model name>_<DFDC|FFPP>' keys to the URL of the matching ".pth" checkpoint file.
+# NOTE(review): the long hexadecimal suffix of each file name looks like a content
+# hash (presumably SHA-256) - confirm before relying on it for integrity checks.
+weight_url = {
+'EfficientNetAutoAttB4ST_DFDC':'https://f002.backblazeb2.com/file/icpr2020/EfficientNetAutoAttB4ST_DFDC_bestval-4df0ef7d2f380a5955affa78c35d0942ac1cd65229510353b252737775515a33.pth',
+'EfficientNetAutoAttB4ST_FFPP':'https://f002.backblazeb2.com/file/icpr2020/EfficientNetAutoAttB4ST_FFPP_bestval-ddb357503b9b902e1b925c2550415604c4252b9b9ecafeb7369dc58cc16e9edd.pth',
+'EfficientNetAutoAttB4_DFDC':'https://f002.backblazeb2.com/file/icpr2020/EfficientNetAutoAttB4_DFDC_bestval-72ed969b2a395fffe11a0d5bf0a635e7260ba2588c28683630d97ff7153389fc.pth',
+'EfficientNetAutoAttB4_FFPP':'https://f002.backblazeb2.com/file/icpr2020/EfficientNetAutoAttB4_FFPP_bestval-b0c9e9522a7143cf119843e910234be5e30f77dc527b1b427cdffa5ce3bdbc25.pth',
+'EfficientNetB4ST_DFDC':'https://f002.backblazeb2.com/file/icpr2020/EfficientNetB4ST_DFDC_bestval-86f0a0701b18694dfb5e7837bd09fa8e48a5146c193227edccf59f1b038181c6.pth',
+'EfficientNetB4ST_FFPP':'https://f002.backblazeb2.com/file/icpr2020/EfficientNetB4ST_FFPP_bestval-ccd016668071be5bf5fff68e446d055441739ec7113fb1a6eee998f08396ae92.pth',
+'EfficientNetB4_DFDC':'https://f002.backblazeb2.com/file/icpr2020/EfficientNetB4_DFDC_bestval-c9f3663e2116d3356d056a0ce6453e0fc412a8df68ebd0902f07104d9129a09a.pth',
+'EfficientNetB4_FFPP':'https://f002.backblazeb2.com/file/icpr2020/EfficientNetB4_FFPP_bestval-93aaad84946829e793d1a67ed7e0309b535e2f2395acb4f8d16b92c0616ba8d7.pth',
+'Xception_DFDC':'https://f002.backblazeb2.com/file/icpr2020/Xception_DFDC_bestval-e826cdb64d73ef491e6b8ff8fce0e1e1b7fc1d8e2715bc51a56280fff17596f9.pth',
+'Xception_FFPP':'https://f002.backblazeb2.com/file/icpr2020/Xception_FFPP_bestval-bb119e4913cb8f816cd28a03f81f4c603d6351bf8e3f8e3eb99eebc923aecd22.pth',
+}

models/icpr2020dfdc/assets/cnfidfeyln_face.gif ADDED Viewed

Git LFS Details

SHA256: 09932133568f6d05897acd8ee8f406c638b8d4618efcf8719e8fc0cceeafc0ca
Pointer size: 132 Bytes
Size of remote file: 8.8 MB

models/icpr2020dfdc/assets/cnfidfeyln_face_att.gif ADDED Viewed

Git LFS Details

SHA256: 09abf3334cc8893b84b32ba78ddeb4ae5ead388ee044b1f41853af9b52612698
Pointer size: 132 Bytes
Size of remote file: 8.27 MB

models/icpr2020dfdc/assets/faces_attention.png ADDED Viewed

Git LFS Details

SHA256: b990e5fa8ef3bbd7105237ad29c82c173e73560f5c7d099d8753cad3a24d1ac9
Pointer size: 131 Bytes
Size of remote file: 560 kB

models/icpr2020dfdc/assets/mqzvfufzoq_face.gif ADDED Viewed

Git LFS Details

SHA256: 0b20deb0fc38243f897878e9e34d8868b82d0f8bdc0f5d7085addfd137c5ad04
Pointer size: 132 Bytes
Size of remote file: 8.73 MB

models/icpr2020dfdc/assets/mqzvfufzoq_face_att.gif ADDED Viewed

Git LFS Details

SHA256: fa6a4793b26555a43ba1c033951bf5540b8ae87d16ee7a0e3ae30d4948da9717
Pointer size: 132 Bytes
Size of remote file: 6.78 MB

models/icpr2020dfdc/blazeface/__init__.py ADDED Viewed

	@@ -0,0 +1,3 @@

+from .blazeface import BlazeFace
+from .face_extract import FaceExtractor
+from .read_video import VideoReader

models/icpr2020dfdc/blazeface/anchors.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a10bb2fb93ab54ca426d6c750bfc3aad685028a16dcf231357d03694f261fd95
+size 28800

models/icpr2020dfdc/blazeface/blazeface.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:54ecff653feaaaf1f7d44b6aff28fd2fc50e483a4e847563b6dd261369c43ba4
+size 420224

models/icpr2020dfdc/blazeface/blazeface.py ADDED Viewed

	@@ -0,0 +1,417 @@

+from typing import List, Union
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+class BlazeBlock(nn.Module):
+    """Residual block made of a depthwise conv followed by a 1x1 conv.
+    The shortcut branch is the input itself, max-pooled when the stride is 2 and
+    zero-padded along the channel dimension when the block adds channels. The sum
+    of both branches goes through a ReLU.
+    """
+    def __init__(self, in_channels, out_channels, kernel_size=3, stride=1):
+        """Builds the block.
+        Arguments:
+            in_channels: number of channels of the input
+            out_channels: number of channels produced by the 1x1 conv
+            kernel_size: kernel size of the depthwise conv
+            stride: stride of the depthwise conv; 2 enables the max-pool shortcut
+        """
+        super(BlazeBlock, self).__init__()
+        self.stride = stride
+        # Number of zero channels appended to the shortcut so it matches the conv output
+        self.channel_pad = out_channels - in_channels
+        # TFLite uses slightly different padding than PyTorch
+        # on the depthwise conv layer when the stride is 2.
+        if stride == 2:
+            self.max_pool = nn.MaxPool2d(kernel_size=stride, stride=stride)
+            padding = 0
+        else:
+            padding = (kernel_size - 1) // 2
+        self.convs = nn.Sequential(
+            # Depthwise conv: one group per input channel
+            nn.Conv2d(in_channels=in_channels, out_channels=in_channels,
+                      kernel_size=kernel_size, stride=stride, padding=padding,
+                      groups=in_channels, bias=True),
+            # Pointwise (1x1) conv that changes the number of channels
+            nn.Conv2d(in_channels=in_channels, out_channels=out_channels,
+                      kernel_size=1, stride=1, padding=0, bias=True),
+        )
+        self.act = nn.ReLU(inplace=True)
+    def forward(self, x):
+        """Returns ReLU(convs(h) + shortcut) for the input x.
+        NOTE(review): the padding tuples below assume a 4-D (N, C, H, W) input - confirm.
+        """
+        if self.stride == 2:
+            # Manual padding (2 at the end of each of the last two dimensions)
+            # because the depthwise conv was built with padding=0 for stride 2.
+            h = F.pad(x, (0, 2, 0, 2), "constant", 0)
+            x = self.max_pool(x)
+        else:
+            h = x
+        if self.channel_pad > 0:
+            # Append channel_pad zero channels to the shortcut
+            x = F.pad(x, (0, 0, 0, 0, 0, self.channel_pad), "constant", 0)
+        return self.act(self.convs(h) + x)
+class BlazeFace(nn.Module):
+    """The BlazeFace face detection model from MediaPipe.
+    The version from MediaPipe is simpler than the one in the paper;
+    it does not use the "double" BlazeBlocks.
+    Because we won't be training this model, it doesn't need to have
+    batchnorm layers. These have already been "folded" into the conv
+    weights by TFLite.
+    The conversion to PyTorch is fairly straightforward, but there are
+    some small differences between TFLite and PyTorch in how they handle
+    padding on conv layers with stride 2.
+    This version works on batches, while the MediaPipe version can only
+    handle a single image at a time.
+    Based on code from https://github.com/tkat0/PyTorch_BlazeFace/ and
+    https://github.com/google/mediapipe/
+    """
+    input_size = (128, 128)
+    detection_keys = [
+        'ymin', 'xmin', 'ymax', 'xmax',
+        'kp1x', 'kp1y', 'kp2x', 'kp2y', 'kp3x', 'kp3y', 'kp4x', 'kp4y', 'kp5x', 'kp5y', 'kp6x', 'kp6y',
+        'conf'
+    ]
+    def __init__(self):
+        """Sets the detector hyper-parameters and builds the layers.
+        No weights or anchors are loaded here; see load_weights() and
+        load_anchors().
+        """
+        super(BlazeFace, self).__init__()
+        # These are the settings from the MediaPipe example graph
+        # mediapipe/graphs/face_detection/face_detection_mobile_gpu.pbtxt
+        self.num_classes = 1
+        self.num_anchors = 896
+        self.num_coords = 16
+        # Raw scores are clamped to [-thresh, thresh] before the sigmoid
+        self.score_clipping_thresh = 100.0
+        # Divisors applied to the raw box predictions in _decode_boxes()
+        self.x_scale = 128.0
+        self.y_scale = 128.0
+        self.h_scale = 128.0
+        self.w_scale = 128.0
+        # Detections scoring below this value are discarded
+        self.min_score_thresh = 0.75
+        # Detections whose IOU exceeds this value are merged by the NMS
+        self.min_suppression_threshold = 0.3
+        self._define_layers()
+    def _define_layers(self):
+        """Creates the two backbones and the classifier/regressor heads.
+        backbone1 feeds classifier_8/regressor_8; backbone2 runs on top of
+        backbone1 and feeds classifier_16/regressor_16 (see forward()).
+        """
+        self.backbone1 = nn.Sequential(
+            nn.Conv2d(in_channels=3, out_channels=24, kernel_size=5, stride=2, padding=0, bias=True),
+            nn.ReLU(inplace=True),
+            BlazeBlock(24, 24),
+            BlazeBlock(24, 28),
+            BlazeBlock(28, 32, stride=2),
+            BlazeBlock(32, 36),
+            BlazeBlock(36, 42),
+            BlazeBlock(42, 48, stride=2),
+            BlazeBlock(48, 56),
+            BlazeBlock(56, 64),
+            BlazeBlock(64, 72),
+            BlazeBlock(72, 80),
+            BlazeBlock(80, 88),
+        )
+        self.backbone2 = nn.Sequential(
+            BlazeBlock(88, 96, stride=2),
+            BlazeBlock(96, 96),
+            BlazeBlock(96, 96),
+            BlazeBlock(96, 96),
+            BlazeBlock(96, 96),
+        )
+        # 1x1 conv heads: scores (classifier_*) and box/keypoint coordinates (regressor_*)
+        self.classifier_8 = nn.Conv2d(88, 2, 1, bias=True)
+        self.classifier_16 = nn.Conv2d(96, 6, 1, bias=True)
+        self.regressor_8 = nn.Conv2d(88, 32, 1, bias=True)
+        self.regressor_16 = nn.Conv2d(96, 96, 1, bias=True)
+    def forward(self, x):
+        """Runs the network on a batch and returns the raw predictions.
+        Returns a list [r, c]: r holds the raw box/keypoint regressions with 16
+        values per anchor, c the raw classification score with 1 value per anchor.
+        The shape comments below are the ones obtained for 128x128 inputs.
+        """
+        # TFLite uses slightly different padding on the first conv layer
+        # than PyTorch, so do it manually.
+        x = F.pad(x, (1, 2, 1, 2), "constant", 0)
+        b = x.shape[0]  # batch size, needed for reshaping later
+        x = self.backbone1(x)  # (b, 88, 16, 16)
+        h = self.backbone2(x)  # (b, 96, 8, 8)
+        # Note: Because PyTorch is NCHW but TFLite is NHWC, we need to
+        # permute the output from the conv layers before reshaping it.
+        c1 = self.classifier_8(x)  # (b, 2, 16, 16)
+        c1 = c1.permute(0, 2, 3, 1)  # (b, 16, 16, 2)
+        c1 = c1.reshape(b, -1, 1)  # (b, 512, 1)
+        c2 = self.classifier_16(h)  # (b, 6, 8, 8)
+        c2 = c2.permute(0, 2, 3, 1)  # (b, 8, 8, 6)
+        c2 = c2.reshape(b, -1, 1)  # (b, 384, 1)
+        c = torch.cat((c1, c2), dim=1)  # (b, 896, 1)
+        r1 = self.regressor_8(x)  # (b, 32, 16, 16)
+        r1 = r1.permute(0, 2, 3, 1)  # (b, 16, 16, 32)
+        r1 = r1.reshape(b, -1, 16)  # (b, 512, 16)
+        r2 = self.regressor_16(h)  # (b, 96, 8, 8)
+        r2 = r2.permute(0, 2, 3, 1)  # (b, 8, 8, 96)
+        r2 = r2.reshape(b, -1, 16)  # (b, 384, 16)
+        r = torch.cat((r1, r2), dim=1)  # (b, 896, 16)
+        return [r, c]
+    def _device(self):
+        """Which device (CPU or GPU) is being used by this model?"""
+        return self.classifier_8.weight.device
+    def load_weights(self, path):
+        self.load_state_dict(torch.load(path))
+        self.eval()
+    def load_anchors(self, path):
+        """Loads the anchors from the NumPy file at path into self.anchors.
+        The array is converted to a float32 tensor on the model's current device
+        and must have shape (self.num_anchors, 4).
+        """
+        self.anchors = torch.tensor(np.load(path), dtype=torch.float32, device=self._device())
+        assert (self.anchors.ndimension() == 2)
+        assert (self.anchors.shape[0] == self.num_anchors)
+        assert (self.anchors.shape[1] == 4)
+    def _preprocess(self, x):
+        """Converts the image pixels to the range [-1, 1]."""
+        return x.float() / 127.5 - 1.0
+    def predict_on_image(self, img):
+        """Makes a prediction on a single image.
+        Arguments:
+            img: a NumPy array of shape (H, W, 3) or a PyTorch tensor of
+                 shape (3, H, W). The image's height and width should be
+                 128 pixels.
+        Returns:
+            A tensor with face detections.
+        """
+        if isinstance(img, np.ndarray):
+            img = torch.from_numpy(img).permute((2, 0, 1))
+        return self.predict_on_batch(img.unsqueeze(0))[0]
+    def predict_on_batch(self, x: np.ndarray or torch.Tensor, apply_nms: bool = True) -> List[torch.Tensor]:
+        """Makes a prediction on a batch of images.
+        Arguments:
+            x: a NumPy array of shape (b, H, W, 3) or a PyTorch tensor of
+               shape (b, 3, H, W). The height and width should be 128 pixels.
+            apply_nms: pass False to not apply non-max suppression
+        Returns:
+            A list containing a tensor of face detections for each image in
+            the batch. If no faces are found for an image, returns a tensor
+            of shape (0, 17).
+        Each face detection is a PyTorch tensor consisting of 17 numbers:
+            - ymin, xmin, ymax, xmax
+            - x,y-coordinates for the 6 keypoints
+            - confidence score
+        """
+        if isinstance(x, np.ndarray):
+            x = torch.from_numpy(x).permute((0, 3, 1, 2))
+        assert x.shape[1] == 3
+        assert x.shape[2] == 128
+        assert x.shape[3] == 128
+        # 1. Preprocess the images into tensors:
+        x = x.to(self._device())
+        x = self._preprocess(x)
+        # 2. Run the neural network:
+        with torch.no_grad():
+            out: torch.Tensor = self.__call__(x)
+        # 3. Postprocess the raw predictions:
+        detections = self._tensors_to_detections(out[0], out[1], self.anchors)
+        # 4. Non-maximum suppression to remove overlapping detections:
+        return self.nms(detections) if apply_nms else detections
+    def nms(self, detections: List[torch.Tensor]) -> List[torch.Tensor]:
+        """Filters out overlapping detections."""
+        filtered_detections = []
+        for i in range(len(detections)):
+            faces = self._weighted_non_max_suppression(detections[i])
+            faces = torch.stack(faces) if len(faces) > 0 else torch.zeros((0, 17), device=self._device())
+            filtered_detections.append(faces)
+        return filtered_detections
+    def _tensors_to_detections(self, raw_box_tensor: torch.Tensor, raw_score_tensor: torch.Tensor, anchors) -> List[
+        torch.Tensor]:
+        """The output of the neural network is a tensor of shape (b, 896, 16)
+        containing the bounding box regressor predictions, as well as a tensor
+        of shape (b, 896, 1) with the classification confidences.
+        This function converts these two "raw" tensors into proper detections.
+        Returns a list of (num_detections, 17) tensors, one for each image in
+        the batch.
+        This is based on the source code from:
+        mediapipe/calculators/tflite/tflite_tensors_to_detections_calculator.cc
+        mediapipe/calculators/tflite/tflite_tensors_to_detections_calculator.proto
+        """
+        assert raw_box_tensor.ndimension() == 3
+        assert raw_box_tensor.shape[1] == self.num_anchors
+        assert raw_box_tensor.shape[2] == self.num_coords
+        assert raw_score_tensor.ndimension() == 3
+        assert raw_score_tensor.shape[1] == self.num_anchors
+        assert raw_score_tensor.shape[2] == self.num_classes
+        assert raw_box_tensor.shape[0] == raw_score_tensor.shape[0]
+        detection_boxes = self._decode_boxes(raw_box_tensor, anchors)
+        # Clamp the raw scores before the sigmoid turns them into confidences.
+        thresh = self.score_clipping_thresh
+        raw_score_tensor = raw_score_tensor.clamp(-thresh, thresh)
+        detection_scores = raw_score_tensor.sigmoid().squeeze(dim=-1)
+        # Note: we stripped off the last dimension from the scores tensor
+        # because there is only one class. Now we can simply use a mask
+        # to filter out the boxes with too low confidence.
+        mask = detection_scores >= self.min_score_thresh
+        # Because each image from the batch can have a different number of
+        # detections, process them one at a time using a loop.
+        output_detections = []
+        for i in range(raw_box_tensor.shape[0]):
+            boxes = detection_boxes[i, mask[i]]
+            scores = detection_scores[i, mask[i]].unsqueeze(dim=-1)
+            # Each row: 16 decoded coordinates followed by the confidence score
+            output_detections.append(torch.cat((boxes, scores), dim=-1))
+        return output_detections
+    def _decode_boxes(self, raw_boxes, anchors):
+        """Converts the predictions into actual coordinates using
+        the anchor boxes. Processes the entire batch at once.
+        """
+        boxes = torch.zeros_like(raw_boxes)
+        x_center = raw_boxes[..., 0] / self.x_scale * anchors[:, 2] + anchors[:, 0]
+        y_center = raw_boxes[..., 1] / self.y_scale * anchors[:, 3] + anchors[:, 1]
+        w = raw_boxes[..., 2] / self.w_scale * anchors[:, 2]
+        h = raw_boxes[..., 3] / self.h_scale * anchors[:, 3]
+        boxes[..., 0] = y_center - h / 2.  # ymin
+        boxes[..., 1] = x_center - w / 2.  # xmin
+        boxes[..., 2] = y_center + h / 2.  # ymax
+        boxes[..., 3] = x_center + w / 2.  # xmax
+        for k in range(6):
+            offset = 4 + k * 2
+            keypoint_x = raw_boxes[..., offset] / self.x_scale * anchors[:, 2] + anchors[:, 0]
+            keypoint_y = raw_boxes[..., offset + 1] / self.y_scale * anchors[:, 3] + anchors[:, 1]
+            boxes[..., offset] = keypoint_x
+            boxes[..., offset + 1] = keypoint_y
+        return boxes
+    def _weighted_non_max_suppression(self, detections):
+        """The alternative NMS method as mentioned in the BlazeFace paper:
+        "We replace the suppression algorithm with a blending strategy that
+        estimates the regression parameters of a bounding box as a weighted
+        mean between the overlapping predictions."
+        The original MediaPipe code assigns the score of the most confident
+        detection to the weighted detection, but we take the average score
+        of the overlapping detections.
+        The input detections should be a Tensor of shape (count, 17).
+        Returns a list of PyTorch tensors, one for each detected face.
+        This is based on the source code from:
+        mediapipe/calculators/util/non_max_suppression_calculator.cc
+        mediapipe/calculators/util/non_max_suppression_calculator.proto
+        """
+        if len(detections) == 0: return []
+        output_detections = []
+        # Sort the detections from highest to lowest score.
+        remaining = torch.argsort(detections[:, 16], descending=True)
+        while len(remaining) > 0:
+            detection = detections[remaining[0]]
+            # Compute the overlap between the first box and the other
+            # remaining boxes. (Note that the other_boxes also include
+            # the first_box.)
+            first_box = detection[:4]
+            other_boxes = detections[remaining, :4]
+            ious = overlap_similarity(first_box, other_boxes)
+            # If two detections don't overlap enough, they are considered
+            # to be from different faces.
+            mask = ious > self.min_suppression_threshold
+            overlapping = remaining[mask]
+            remaining = remaining[~mask]
+            # Take an average of the coordinates from the overlapping
+            # detections, weighted by their confidence scores.
+            weighted_detection = detection.clone()
+            if len(overlapping) > 1:
+                coordinates = detections[overlapping, :16]
+                scores = detections[overlapping, 16:17]
+                total_score = scores.sum()
+                weighted = (coordinates * scores).sum(dim=0) / total_score
+                weighted_detection[:16] = weighted
+                weighted_detection[16] = total_score / len(overlapping)
+            output_detections.append(weighted_detection)
+        return output_detections
+    # IOU code from https://github.com/amdegroot/ssd.pytorch/blob/master/layers/box_utils.py
+def intersect(box_a, box_b):
+    """ We resize both tensors to [A,B,2] without new malloc:
+    [A,2] -> [A,1,2] -> [A,B,2]
+    [B,2] -> [1,B,2] -> [A,B,2]
+    Then we compute the area of intersect between box_a and box_b.
+    Args:
+      box_a: (tensor) bounding boxes, Shape: [A,4].
+      box_b: (tensor) bounding boxes, Shape: [B,4].
+    Return:
+      (tensor) intersection area, Shape: [A,B].
+    """
+    A = box_a.size(0)
+    B = box_b.size(0)
+    max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2),
+                       box_b[:, 2:].unsqueeze(0).expand(A, B, 2))
+    min_xy = torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2),
+                       box_b[:, :2].unsqueeze(0).expand(A, B, 2))
+    inter = torch.clamp((max_xy - min_xy), min=0)
+    return inter[:, :, 0] * inter[:, :, 1]
+def jaccard(box_a, box_b):
+    """Compute the jaccard overlap of two sets of boxes.  The jaccard overlap
+    is simply the intersection over union of two boxes.  Here we operate on
+    ground truth boxes and default boxes.
+    E.g.:
+        A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B)
+    Args:
+        box_a: (tensor) Ground truth bounding boxes, Shape: [num_objects,4]
+        box_b: (tensor) Prior boxes from priorbox layers, Shape: [num_priors,4]
+    Return:
+        jaccard overlap: (tensor) Shape: [box_a.size(0), box_b.size(0)]
+    """
+    inter = intersect(box_a, box_b)
+    area_a = ((box_a[:, 2] - box_a[:, 0]) *
+              (box_a[:, 3] - box_a[:, 1])).unsqueeze(1).expand_as(inter)  # [A,B]
+    area_b = ((box_b[:, 2] - box_b[:, 0]) *
+              (box_b[:, 3] - box_b[:, 1])).unsqueeze(0).expand_as(inter)  # [A,B]
+    union = area_a + area_b - inter
+    return inter / union  # [A,B]
+def overlap_similarity(box, other_boxes):
+    """Computes the IOU between a bounding box and set of other boxes."""
+    return jaccard(box.unsqueeze(0), other_boxes).squeeze(0)

models/icpr2020dfdc/blazeface/face_extract.py ADDED Viewed

	@@ -0,0 +1,470 @@

+import os
+from typing import Tuple, List
+import cv2
+import numpy as np
+import torch
+from PIL import Image
+from blazeface import BlazeFace
+class FaceExtractor:
+    """Wrapper for face extraction workflow."""
+    def __init__(self, video_read_fn = None, facedet: BlazeFace = None):
+        """Creates a new FaceExtractor.
+        Both arguments are simply stored on the instance; nothing is loaded or
+        validated here.
+        Arguments:
+            video_read_fn: a function that takes in a path to a video file
+                and returns a tuple consisting of a NumPy array with shape
+                (num_frames, H, W, 3) and a list of frame indices, or None
+                in case of an error
+            facedet: the face detector object
+        """
+        self.video_read_fn = video_read_fn
+        self.facedet = facedet
+    def process_image(self, path: str = None, img: Image.Image or np.ndarray = None) -> dict:
+        """
+        Process a single image: detect the faces and crop them out.
+        Exactly one of path and img must be given.
+        NOTE(review): the annotation `Image.Image or np.ndarray` evaluates to
+        `Image.Image` only; both PIL images and NumPy arrays go through np.asarray().
+        :param path: Path to the image
+        :param img: image
+        :return: dict with keys "frame_w", "frame_h", "faces", "kpts",
+                 "detections" (NumPy array) and "scores", with the per-face
+                 entries sorted by descending score
+        :raises ValueError: if both or neither of path and img are specified
+        """
+        if img is not None and path is not None:
+            raise ValueError('Only one argument between path and img can be specified')
+        if img is None and path is None:
+            raise ValueError('At least one argument between path and img must be specified')
+        target_size = self.facedet.input_size
+        if img is None:
+            img = np.asarray(Image.open(str(path)))
+        else:
+            img = np.asarray(img)
+        # Split the frames into several tiles. Resize the tiles to 128x128.
+        # The single image is wrapped into a batch of one frame first.
+        tiles, resize_info = self._tile_frames(np.expand_dims(img, 0), target_size)
+        # tiles has shape (num_tiles, target_size, target_size, 3)
+        # resize_info is a list of four elements [resize_factor_y, resize_factor_x, 0, 0]
+        # Run the face detector. The result is a list of PyTorch tensors,
+        # one for each tile in the batch.
+        detections = self.facedet.predict_on_batch(tiles, apply_nms=False)
+        # Convert the detections from 128x128 back to the original frame size.
+        detections = self._resize_detections(detections, target_size, resize_info)
+        # Because we have several tiles for each frame, combine the predictions
+        # from these tiles. The result is a list of PyTorch tensors, but now one
+        # for each frame (rather than each tile).
+        num_frames = 1
+        # frame_size is (width, height)
+        frame_size = (img.shape[1], img.shape[0])
+        detections = self._untile_detections(num_frames, frame_size, detections)
+        # The same face may have been detected in multiple tiles, so filter out
+        # overlapping detections. This is done separately for each frame.
+        detections = self.facedet.nms(detections)
+        # Crop the faces out of the original frame.
+        frameref_detections = self._add_margin_to_detections(detections[0], frame_size, 0.2)
+        faces = self._crop_faces(img, frameref_detections)
+        kpts = self._crop_kpts(img, detections[0], 0.3)
+        # Add additional information about the frame and detections.
+        # Column 16 of each detection holds its confidence score.
+        scores = list(detections[0][:, 16].cpu().numpy())
+        frame_dict = {"frame_w": frame_size[0],
+                      "frame_h": frame_size[1],
+                      "faces": faces,
+                      "kpts": kpts,
+                      "detections": frameref_detections.cpu().numpy(),
+                      "scores": scores,
+                      }
+        # Sort faces by descending confidence
+        frame_dict = self._soft_faces_by_descending_score(frame_dict)
+        return frame_dict
+    def _soft_faces_by_descending_score(self, frame_dict: dict) -> dict:
+        if len(frame_dict['scores']) > 1:
+            sort_idxs = np.argsort(frame_dict['scores'])[::-1]
+            new_faces = [frame_dict['faces'][i] for i in sort_idxs]
+            new_kpts = [frame_dict['kpts'][i] for i in sort_idxs]
+            new_detections = frame_dict['detections'][sort_idxs]
+            new_scores = [frame_dict['scores'][i] for i in sort_idxs]
+            frame_dict['faces'] = new_faces
+            frame_dict['kpts'] = new_kpts
+            frame_dict['detections'] = new_detections
+            frame_dict['scores'] = new_scores
+        return frame_dict
+    def process_videos(self, input_dir, filenames, video_idxs) -> List[dict]:
+        """For the specified selection of videos, grabs one or more frames
+        from each video, runs the face detector, and tries to find the faces
+        in each frame.
+        The frames are split into tiles, and the tiles from the different videos
+        are concatenated into a single batch. This means the face detector gets
+        a batch of size len(video_idxs) * num_frames * num_tiles (usually 3).
+        Arguments:
+            input_dir: base folder where the video files are stored
+            filenames: list of all video files in the input_dir
+            video_idxs: one or more indices from the filenames list; these
+                are the videos we'll actually process
+        Returns a list of dictionaries, one for each frame read from each video.
+        This dictionary contains:
+            - video_idx: the video this frame was taken from
+            - frame_idx: the index of the frame in the video
+            - frame_w, frame_h: original dimensions of the frame
+            - faces: a list containing zero or more NumPy arrays with a face crop
+            - scores: a list array with the confidence score for each face crop
+        If reading a video failed for some reason, it will not appear in the
+        output array. Note that there's no guarantee a given video will actually
+        have num_frames results (as soon as a reading problem is encountered for
+        a video, we continue with the next video).
+        """
+        target_size = self.facedet.input_size
+        videos_read = []
+        frames_read = []
+        frames = []
+        tiles = []
+        resize_info = []
+        for video_idx in video_idxs:
+            # Read the full-size frames from this video.
+            filename = filenames[video_idx]
+            video_path = os.path.join(input_dir, filename)
+            result = self.video_read_fn(video_path)
+            # Error? Then skip this video.
+            if result is None: continue
+            videos_read.append(video_idx)
+            # Keep track of the original frames (need them later).
+            my_frames, my_idxs = result
+            frames.append(my_frames)
+            frames_read.append(my_idxs)
+            # Split the frames into several tiles. Resize the tiles to 128x128.
+            my_tiles, my_resize_info = self._tile_frames(my_frames, target_size)
+            tiles.append(my_tiles)
+            resize_info.append(my_resize_info)
+        if len(tiles) == 0:
+            return []
+        # Put all the tiles for all the frames from all the videos into
+        # a single batch.
+        batch = np.concatenate(tiles)
+        # Run the face detector. The result is a list of PyTorch tensors,
+        # one for each image in the batch.
+        all_detections = self.facedet.predict_on_batch(batch, apply_nms=False)
+        result = []
+        offs = 0
+        for v in range(len(tiles)):
+            # Not all videos may have the same number of tiles, so find which
+            # detections go with which video.
+            num_tiles = tiles[v].shape[0]
+            detections = all_detections[offs:offs + num_tiles]
+            offs += num_tiles
+            # Convert the detections from 128x128 back to the original frame size.
+            detections = self._resize_detections(detections, target_size, resize_info[v])
+            # Because we have several tiles for each frame, combine the predictions
+            # from these tiles. The result is a list of PyTorch tensors, but now one
+            # for each frame (rather than each tile).
+            num_frames = frames[v].shape[0]
+            frame_size = (frames[v].shape[2], frames[v].shape[1])
+            detections = self._untile_detections(num_frames, frame_size, detections)
+            # The same face may have been detected in multiple tiles, so filter out
+            # overlapping detections. This is done separately for each frame.
+            detections = self.facedet.nms(detections)
+            for i in range(len(detections)):
+                # Crop the faces out of the original frame.
+                frameref_detections = self._add_margin_to_detections(detections[i], frame_size, 0.2)
+                faces = self._crop_faces(frames[v][i], frameref_detections)
+                kpts = self._crop_kpts(frames[v][i], detections[i], 0.3)
+                # Add additional information about the frame and detections.
+                scores = list(detections[i][:, 16].cpu().numpy())
+                frame_dict = {"video_idx": videos_read[v],
+                              "frame_idx": frames_read[v][i],
+                              "frame_w": frame_size[0],
+                              "frame_h": frame_size[1],
+                              "frame": frames[v][i],
+                              "faces": faces,
+                              "kpts": kpts,
+                              "detections": frameref_detections.cpu().numpy(),
+                              "scores": scores,
+                              }
+                # Sort faces by descending confidence
+                frame_dict = self._soft_faces_by_descending_score(frame_dict)
+                result.append(frame_dict)
+        return result
+    def process_video(self, video_path):
+        """Convenience method for doing face extraction on a single video."""
+        input_dir = os.path.dirname(video_path)
+        filenames = [os.path.basename(video_path)]
+        return self.process_videos(input_dir, filenames, [0])
+    def _tile_frames(self, frames: np.ndarray, target_size: Tuple[int, int]) -> (np.ndarray, List[float]):
+        """Splits each frame into several smaller, partially overlapping tiles
+        and resizes each tile to target_size.
+        After a bunch of experimentation, I found that for a 1920x1080 video,
+        BlazeFace works better on three 1080x1080 windows. These overlap by 420
+        pixels. (Two windows also work but it's best to have a clean center crop
+        in there as well.)
+        I also tried 6 windows of size 720x720 (horizontally: 720|360, 360|720;
+        vertically: 720|1200, 480|720|480, 1200|720) but that gives many false
+        positives when a window has no face in it.
+        For a video in portrait orientation (1080x1920), we only take a single
+        crop of the top-most 1080 pixels. If we split up the video vertically,
+        then we might get false positives again.
+        (NOTE: Not all videos are necessarily 1080p but the code can handle this.)
+        Arguments:
+            frames: NumPy array of shape (num_frames, height, width, 3)
+            target_size: (width, height)
+        Returns:
+            - a new (num_frames * N, target_size[1], target_size[0], 3) array
+              where N is the number of tiles used.
+            - a list [scale_w, scale_h, offset_x, offset_y] that describes how
+              to map the resized and cropped tiles back to the original image
+              coordinates. This is needed for scaling up the face detections
+              from the smaller image to the original image, so we can take the
+              face crops in the original coordinate space.
+        """
+        num_frames, H, W, _ = frames.shape
+        num_h, num_v, split_size, x_step, y_step = self.get_tiles_params(H, W)
+        splits = np.zeros((num_frames * num_v * num_h, target_size[1], target_size[0], 3), dtype=np.uint8)
+        i = 0
+        for f in range(num_frames):
+            y = 0
+            for v in range(num_v):
+                x = 0
+                for h in range(num_h):
+                    crop = frames[f, y:y + split_size, x:x + split_size, :]
+                    splits[i] = cv2.resize(crop, target_size, interpolation=cv2.INTER_AREA)
+                    x += x_step
+                    i += 1
+                y += y_step
+        resize_info = [split_size / target_size[0], split_size / target_size[1], 0, 0]
+        return splits, resize_info
+    def get_tiles_params(self, H, W):
+        split_size = min(H, W, 720)
+        x_step = (W - split_size) // 2
+        y_step = (H - split_size) // 2
+        num_v = (H - split_size) // y_step + 1 if y_step > 0 else 1
+        num_h = (W - split_size) // x_step + 1 if x_step > 0 else 1
+        return num_h, num_v, split_size, x_step, y_step
+    def _resize_detections(self, detections, target_size, resize_info):
+        """Converts a list of face detections back to the original
+        coordinate system.
+        Arguments:
+            detections: a list containing PyTorch tensors of shape (num_faces, 17)
+            target_size: (width, height)
+            resize_info: [scale_w, scale_h, offset_x, offset_y]
+        """
+        projected = []
+        target_w, target_h = target_size
+        scale_w, scale_h, offset_x, offset_y = resize_info
+        for i in range(len(detections)):
+            detection = detections[i].clone()
+            # ymin, xmin, ymax, xmax
+            for k in range(2):
+                detection[:, k * 2] = (detection[:, k * 2] * target_h - offset_y) * scale_h
+                detection[:, k * 2 + 1] = (detection[:, k * 2 + 1] * target_w - offset_x) * scale_w
+            # keypoints are x,y
+            for k in range(2, 8):
+                detection[:, k * 2] = (detection[:, k * 2] * target_w - offset_x) * scale_w
+                detection[:, k * 2 + 1] = (detection[:, k * 2 + 1] * target_h - offset_y) * scale_h
+            projected.append(detection)
+        return projected
+    def _untile_detections(self, num_frames: int, frame_size: Tuple[int, int], detections: List[torch.Tensor]) -> List[
+        torch.Tensor]:
+        """With N tiles per frame, there also are N times as many detections.
+        This function groups together the detections for a given frame; it is
+        the complement to tile_frames().
+        """
+        combined_detections = []
+        W, H = frame_size
+        num_h, num_v, split_size, x_step, y_step = self.get_tiles_params(H, W)
+        i = 0
+        for f in range(num_frames):
+            detections_for_frame = []
+            y = 0
+            for v in range(num_v):
+                x = 0
+                for h in range(num_h):
+                    # Adjust the coordinates based on the split positions.
+                    detection = detections[i].clone()
+                    if detection.shape[0] > 0:
+                        for k in range(2):
+                            detection[:, k * 2] += y
+                            detection[:, k * 2 + 1] += x
+                        for k in range(2, 8):
+                            detection[:, k * 2] += x
+                            detection[:, k * 2 + 1] += y
+                    detections_for_frame.append(detection)
+                    x += x_step
+                    i += 1
+                y += y_step
+            combined_detections.append(torch.cat(detections_for_frame))
+        return combined_detections
+    def _add_margin_to_detections(self, detections: torch.Tensor, frame_size: Tuple[int, int],
+                                  margin: float = 0.2) -> torch.Tensor:
+        """Expands the face bounding box.
+        NOTE: The face detections often do not include the forehead, which
+        is why we use twice the margin for ymin.
+        Arguments:
+            detections: a PyTorch tensor of shape (num_detections, 17)
+            frame_size: maximum (width, height)
+            margin: a percentage of the bounding box's height
+        Returns a PyTorch tensor of shape (num_detections, 17).
+        """
+        offset = torch.round(margin * (detections[:, 2] - detections[:, 0]))
+        detections = detections.clone()
+        detections[:, 0] = torch.clamp(detections[:, 0] - offset * 2, min=0)  # ymin
+        detections[:, 1] = torch.clamp(detections[:, 1] - offset, min=0)  # xmin
+        detections[:, 2] = torch.clamp(detections[:, 2] + offset, max=frame_size[1])  # ymax
+        detections[:, 3] = torch.clamp(detections[:, 3] + offset, max=frame_size[0])  # xmax
+        return detections
+    def _crop_faces(self, frame: np.ndarray, detections: torch.Tensor) -> List[np.ndarray]:
+        """Copies the face region(s) from the given frame into a set
+        of new NumPy arrays.
+        Arguments:
+            frame: a NumPy array of shape (H, W, 3)
+            detections: a PyTorch tensor of shape (num_detections, 17)
+        Returns a list of NumPy arrays, one for each face crop. If there
+        are no faces detected for this frame, returns an empty list.
+        """
+        faces = []
+        for i in range(len(detections)):
+            ymin, xmin, ymax, xmax = detections[i, :4].cpu().numpy().astype(int)
+            face = frame[ymin:ymax, xmin:xmax, :]
+            faces.append(face)
+        return faces
+    def _crop_kpts(self, frame: np.ndarray, detections: torch.Tensor, face_fraction: float):
+        """Copies the parts region(s) from the given frame into a set
+        of new NumPy arrays.
+        Arguments:
+            frame: a NumPy array of shape (H, W, 3)
+            detections: a PyTorch tensor of shape (num_detections, 17)
+            face_fraction: float between 0 and 1 indicating how big are the parts to be extracted w.r.t the whole face
+        Returns a list of NumPy arrays, one for each face crop. If there
+        are no faces detected for this frame, returns an empty list.
+        """
+        faces = []
+        for i in range(len(detections)):
+            kpts = []
+            size = int(face_fraction * min(detections[i, 2] - detections[i, 0], detections[i, 3] - detections[i, 1]))
+            kpts_coords = detections[i, 4:16].cpu().numpy().astype(int)
+            for kpidx in range(6):
+                kpx, kpy = kpts_coords[kpidx * 2:kpidx * 2 + 2]
+                kpt = frame[kpy - size // 2:kpy - size // 2 + size, kpx - size // 2:kpx - size // 2 + size, ]
+                kpts.append(kpt)
+            faces.append(kpts)
+        return faces
+    def remove_large_crops(self, crops, pct=0.1):
+        """Removes faces from the results if they take up more than X%
+        of the video. Such a face is likely a false positive.
+        This is an optional postprocessing step. Modifies the original
+        data structure.
+        Arguments:
+            crops: a list of dictionaries with face crop data
+            pct: maximum portion of the frame a crop may take up
+        """
+        for i in range(len(crops)):
+            frame_data = crops[i]
+            video_area = frame_data["frame_w"] * frame_data["frame_h"]
+            faces = frame_data["faces"]
+            scores = frame_data["scores"]
+            new_faces = []
+            new_scores = []
+            for j in range(len(faces)):
+                face = faces[j]
+                face_H, face_W, _ = face.shape
+                face_area = face_H * face_W
+                if face_area / video_area < 0.1:
+                    new_faces.append(face)
+                    new_scores.append(scores[j])
+            frame_data["faces"] = new_faces
+            frame_data["scores"] = new_scores
+    def keep_only_best_face(self, crops):
+        """For each frame, only keeps the face with the highest confidence.
+        This gets rid of false positives, but obviously is problematic for
+        videos with two people!
+        This is an optional postprocessing step. Modifies the original
+        data structure.
+        """
+        for i in range(len(crops)):
+            frame_data = crops[i]
+            if len(frame_data["faces"]) > 0:
+                frame_data["faces"] = frame_data["faces"][:1]
+                frame_data["scores"] = frame_data["scores"][:1]
+    # TODO: def filter_likely_false_positives(self, crops):
+    #   if only some frames have more than 1 face, it's likely a false positive
+    #   if most frames have more than 1 face, it's probably two people
+    #   so find the % of frames with > 1 face; if > 0.X, keep the two best faces
+    # TODO: def filter_by_score(self, crops, min_score) to remove any
+    # crops with a confidence score lower than min_score
+    # TODO: def sort_by_histogram(self, crops) for videos with 2 people.

models/icpr2020dfdc/blazeface/read_video.py ADDED Viewed

	@@ -0,0 +1,213 @@

+import cv2
+import numpy as np
+class VideoReader:
+    """Helper class for reading one or more frames from a video file."""
+    def __init__(self, verbose=True, insets=(0, 0)):
+        """Creates a new VideoReader.
+        Arguments:
+            verbose: whether to print warnings and error messages
+            insets: amount to inset the image by, as a percentage of
+                (width, height). This lets you "zoom in" to an image
+                to remove unimportant content around the borders.
+                Useful for face detection, which may not work if the
+                faces are too small.
+        """
+        self.verbose = verbose
+        self.insets = insets
+    def read_frames(self, path, num_frames, jitter=0, seed=None):
+        """Reads frames that are always evenly spaced throughout the video.
+        Arguments:
+            path: the video file
+            num_frames: how many frames to read, -1 means the entire video
+                (warning: this will take up a lot of memory!)
+            jitter: if not 0, adds small random offsets to the frame indices;
+                this is useful so we don't always land on even or odd frames
+            seed: random seed for jittering; if you set this to a fixed value,
+                you probably want to set it only on the first video
+        """
+        assert num_frames > 0
+        capture = cv2.VideoCapture(path)
+        frame_count = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))
+        if frame_count <= 0: return None
+        frame_idxs = np.linspace(0, frame_count - 1, num_frames, endpoint=True, dtype=int)
+        frame_idxs = np.unique(frame_idxs)  # Avoid repeating frame idxs otherwise it breaks reading
+        if jitter > 0:
+            np.random.seed(seed)
+            jitter_offsets = np.random.randint(-jitter, jitter, len(frame_idxs))
+            frame_idxs = np.clip(frame_idxs + jitter_offsets, 0, frame_count - 1)
+        result = self._read_frames_at_indices(path, capture, frame_idxs)
+        capture.release()
+        return result
+    def read_random_frames(self, path, num_frames, seed=None):
+        """Picks the frame indices at random.
+        Arguments:
+            path: the video file
+            num_frames: how many frames to read, -1 means the entire video
+                (warning: this will take up a lot of memory!)
+        """
+        assert num_frames > 0
+        np.random.seed(seed)
+        capture = cv2.VideoCapture(path)
+        frame_count = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))
+        if frame_count <= 0: return None
+        frame_idxs = sorted(np.random.choice(np.arange(0, frame_count), num_frames))
+        result = self._read_frames_at_indices(path, capture, frame_idxs)
+        capture.release()
+        return result
+    def read_frames_at_indices(self, path, frame_idxs):
+        """Reads frames from a video and puts them into a NumPy array.
+        Arguments:
+            path: the video file
+            frame_idxs: a list of frame indices. Important: should be
+                sorted from low-to-high! If an index appears multiple
+                times, the frame is still read only once.
+        Returns:
+            - a NumPy array of shape (num_frames, height, width, 3)
+            - a list of the frame indices that were read
+        Reading stops if loading a frame fails, in which case the first
+        dimension returned may actually be less than num_frames.
+        Returns None if an exception is thrown for any reason, or if no
+        frames were read.
+        """
+        assert len(frame_idxs) > 0
+        capture = cv2.VideoCapture(path)
+        result = self._read_frames_at_indices(path, capture, frame_idxs)
+        capture.release()
+        return result
+    def _read_frames_at_indices(self, path, capture, frame_idxs):
+        try:
+            frames = []
+            idxs_read = []
+            for frame_idx in range(frame_idxs[0], frame_idxs[-1] + 1):
+                # Get the next frame, but don't decode if we're not using it.
+                ret = capture.grab()
+                if not ret:
+                    if self.verbose:
+                        print("Error grabbing frame %d from movie %s" % (frame_idx, path))
+                    break
+                # Need to look at this frame?
+                current = len(idxs_read)
+                if frame_idx == frame_idxs[current]:
+                    ret, frame = capture.retrieve()
+                    if not ret or frame is None:
+                        if self.verbose:
+                            print("Error retrieving frame %d from movie %s" % (frame_idx, path))
+                        break
+                    frame = self._postprocess_frame(frame)
+                    frames.append(frame)
+                    idxs_read.append(frame_idx)
+            if len(frames) > 0:
+                return np.stack(frames), idxs_read
+            if self.verbose:
+                print("No frames read from movie %s" % path)
+            return None
+        except:
+            if self.verbose:
+                print("Exception while reading movie %s" % path)
+            return None
+    def read_middle_frame(self, path):
+        """Reads the frame from the middle of the video."""
+        capture = cv2.VideoCapture(path)
+        frame_count = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))
+        result = self._read_frame_at_index(path, capture, frame_count // 2)
+        capture.release()
+        return result
+    def read_frame_at_index(self, path, frame_idx):
+        """Reads a single frame from a video.
+        If you just want to read a single frame from the video, this is more
+        efficient than scanning through the video to find the frame. However,
+        for reading multiple frames it's not efficient.
+        My guess is that a "streaming" approach is more efficient than a
+        "random access" approach because, unless you happen to grab a keyframe,
+        the decoder still needs to read all the previous frames in order to
+        reconstruct the one you're asking for.
+        Returns a NumPy array of shape (1, H, W, 3) and the index of the frame,
+        or None if reading failed.
+        """
+        capture = cv2.VideoCapture(path)
+        result = self._read_frame_at_index(path, capture, frame_idx)
+        capture.release()
+        return result
+    def _read_frame_at_index(self, path, capture, frame_idx):
+        capture.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
+        ret, frame = capture.read()
+        if not ret or frame is None:
+            if self.verbose:
+                print("Error retrieving frame %d from movie %s" % (frame_idx, path))
+            return None
+        else:
+            frame = self._postprocess_frame(frame)
+            return np.expand_dims(frame, axis=0), [frame_idx]
+    def _postprocess_frame(self, frame):
+        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+        if self.insets[0] > 0:
+            W = frame.shape[1]
+            p = int(W * self.insets[0])
+            frame = frame[:, p:-p, :]
+        if self.insets[1] > 0:
+            H = frame.shape[1]
+            q = int(H * self.insets[1])
+            frame = frame[q:-q, :, :]
+        return frame
+class VideoReaderIspl(VideoReader):
+    """
+    Derived VideoReader class with overriden read_frames method
+    """
+    def read_frames_with_hop(self, path: str, num_frames: int = -1, fps: int = -1):
+        """Reads frames up to a certain number spaced throughout the video with a rate decided by the user.
+        Arguments:
+            path: the video file
+            num_frames: how many frames to read, -1 means the entire video
+                (warning: this will take up a lot of memory!)
+            fps: how many frames per second to pick
+        """
+        assert num_frames > 0
+        capture = cv2.VideoCapture(path)
+        frame_count = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))
+        if frame_count <= 0: return None
+        video_rate = capture.get(cv2.CAP_PROP_FPS)
+        hop = 1 if fps == -1 else max(video_rate // fps, 1)
+        end_pts = frame_count if num_frames == -1 else num_frames * hop
+        frame_idxs = np.arange(0, end_pts - 1, hop, endpoint=True, dtype=int)
+        result = self._read_frames_at_indices(path, capture, frame_idxs)
+        capture.release()
+        return result

models/icpr2020dfdc/environment.yml ADDED Viewed

	@@ -0,0 +1,25 @@

+name: icpr2020
+channels:
+  - pytorch
+  - conda-forge
+  - defaults
+dependencies:
+  - av=6.2.0
+  - albumentations
+  - cudatoolkit
+  - ffmpeg
+  - jupyter
+  - numpy
+  - opencv=3.4.2
+  - py-opencv=3.4.2
+  - python=3.6.9
+  - pip
+  - pytorch=1.4.0
+  - torchvision
+  - tqdm
+  - pandas
+  - pip:
+    - tensorboardx==2.0
+    - efficientnet-pytorch
+    - scikit-learn

models/icpr2020dfdc/extract_faces.py ADDED Viewed

	@@ -0,0 +1,346 @@

+"""
+Extract faces
+Video Face Manipulation Detection Through Ensemble of CNNs
+Image and Sound Processing Lab - Politecnico di Milano
+Nicolò Bonettini
+Edoardo Daniele Cannas
+Sara Mandelli
+Luca Bondi
+Paolo Bestagini
+"""
+import argparse
+import sys
+import traceback
+from concurrent.futures import ThreadPoolExecutor
+from functools import partial
+from pathlib import Path
+from typing import Tuple, List
+import numpy as np
+import pandas as pd
+import torch
+import torch.cuda
+from PIL import Image
+from tqdm import tqdm
+import blazeface
+from blazeface import BlazeFace, VideoReader, FaceExtractor
+from isplutils.utils import adapt_bb
+def parse_args(argv):
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--source', type=Path, help='Videos root directory', required=True)
+    parser.add_argument('--videodf', type=Path, help='Path to read the videos DataFrame', required=True)
+    parser.add_argument('--facesfolder', type=Path, help='Faces output root directory', required=True)
+    parser.add_argument('--facesdf', type=Path, help='Path to save the output DataFrame of faces', required=True)
+    parser.add_argument('--checkpoint', type=Path, help='Path to save the temporary per-video outputs', required=True)
+    parser.add_argument('--fpv', type=int, default=32, help='Frames per video')
+    parser.add_argument('--device', type=torch.device,
+                        default=torch.device('cuda:0' if torch.cuda.is_available() else 'cpu'),
+                        help='Device to use for face extraction')
+    parser.add_argument('--collateonly', help='Only perform collation of pre-existing results', action='store_true')
+    parser.add_argument('--noindex', help='Do not rebuild the index', action='store_false')
+    parser.add_argument('--batch', type=int, help='Batch size', default=16)
+    parser.add_argument('--threads', type=int, help='Number of threads', default=8)
+    parser.add_argument('--offset', type=int, help='Offset to start extraction', default=0)
+    parser.add_argument('--num', type=int, help='Number of videos to process', default=0)
+    parser.add_argument('--lazycheck', action='store_true', help='Lazy check of existing video indexes')
+    parser.add_argument('--deepcheck', action='store_true', help='Try to open every image')
+    return parser.parse_args(argv)
+def main(argv):
+    """Extracts faces from the indexed videos and collates the results.
+    Two phases, both driven by the parsed command-line arguments:
+      1. Unless --collateonly is given, every selected video is passed to
+         process_video() through a thread pool; the returned face images are
+         saved as JPEGs and a per-video checkpoint DataFrame is pickled.
+      2. Unless --noindex is given, all per-video checkpoints are read back,
+         the videos DataFrame gets an 'nfaces' column and is re-saved, and a
+         single faces DataFrame is written.
+    Arguments:
+        argv: list of command-line arguments (without the program name)
+    Raises ValueError if indexing is enabled and no checkpoint could be read.
+    """
+    args = parse_args(argv)
+    ## Parameters parsing
+    device: torch.device = args.device
+    source_dir: Path = args.source
+    facedestination_dir: Path = args.facesfolder
+    frames_per_video: int = args.fpv
+    videodataset_path: Path = args.videodf
+    facesdataset_path: Path = args.facesdf
+    collateonly: bool = args.collateonly
+    batch_size: int = args.batch
+    threads: int = args.threads
+    offset: int = args.offset
+    num: int = args.num
+    lazycheck: bool = args.lazycheck
+    deepcheck: bool = args.deepcheck
+    checkpoint_folder: Path = args.checkpoint
+    # --noindex is declared with action='store_false', so this is True
+    # (indexing enabled) unless the flag was passed.
+    index_enable: bool = args.noindex
+    ## Parameters
+    face_size = 512
+    print('Loading video DataFrame')
+    df_videos = pd.read_pickle(videodataset_path)
+    # Select the slice of videos to process: `num` rows starting at `offset`,
+    # or everything from `offset` onwards when num is 0.
+    if num > 0:
+        df_videos_process = df_videos.iloc[offset:offset + num]
+    else:
+        df_videos_process = df_videos.iloc[offset:]
+    if not collateonly:
+        ## Blazeface loading
+        print('Loading face extractor')
+        facedet = BlazeFace().to(device)
+        # Relative paths: the weights are resolved against the current
+        # working directory.
+        facedet.load_weights("blazeface/blazeface.pth")
+        facedet.load_anchors("blazeface/anchors.npy")
+        videoreader = VideoReader(verbose=False)
+        video_read_fn = lambda x: videoreader.read_frames(x, num_frames=frames_per_video)
+        face_extractor = FaceExtractor(video_read_fn, facedet)
+        ## Face extraction
+        with ThreadPoolExecutor(threads) as p:
+            # Videos are handed to the pool batch_size rows at a time.
+            for batch_idx0 in tqdm(np.arange(start=0, stop=len(df_videos_process), step=batch_size),
+                                   desc='Extracting faces'):
+                tosave_list = list(p.map(partial(process_video,
+                                                 source_dir=source_dir,
+                                                 facedestination_dir=facedestination_dir,
+                                                 checkpoint_folder=checkpoint_folder,
+                                                 face_size=face_size,
+                                                 face_extractor=face_extractor,
+                                                 lazycheck=lazycheck,
+                                                 deepcheck=deepcheck,
+                                                 ),
+                                         df_videos_process.iloc[batch_idx0:batch_idx0 + batch_size].iterrows()))
+                # Each non-None result is used as (checkpoint DataFrame,
+                # checkpoint path, list of save_jpg arguments).
+                for tosave in tosave_list:
+                    if tosave is not None:
+                        # Write the images first, then the checkpoint that
+                        # references them.
+                        if len(tosave[2]):
+                            list(p.map(save_jpg, tosave[2]))
+                        tosave[1].parent.mkdir(parents=True, exist_ok=True)
+                        tosave[0].to_pickle(str(tosave[1]))
+    if index_enable:
+        # Collect checkpoints
+        # NOTE: this walks ALL videos in the DataFrame, not only the
+        # offset/num slice processed above.
+        df_videos['nfaces'] = np.zeros(len(df_videos), np.uint8)
+        faces_dataset = []
+        for idx, record in tqdm(df_videos.iterrows(), total=len(df_videos), desc='Collecting faces results'):
+            # Checkpoint
+            video_face_checkpoint_path = checkpoint_folder.joinpath(record['path']).with_suffix('.faces.pkl')
+            if video_face_checkpoint_path.exists():
+                try:
+                    df_video_faces = pd.read_pickle(str(video_face_checkpoint_path))
+                    # Fix same attribute issue
+                    df_video_faces = df_video_faces.rename(columns={'subject': 'videosubject'}, errors='ignore')
+                    # The number of faces is the number of distinct integer
+                    # ids found between '_subj' and '.jpg' in the index values.
+                    nfaces = len(
+                        np.unique(df_video_faces.index.map(lambda x: int(x.split('_subj')[1].split('.jpg')[0]))))
+                    df_videos.loc[idx, 'nfaces'] = nfaces
+                    faces_dataset.append(df_video_faces)
+                except Exception as e:
+                    # An unreadable or malformed checkpoint is reported and
+                    # deleted.
+                    print('Error while reading: {}'.format(video_face_checkpoint_path))
+                    print(e)
+                    video_face_checkpoint_path.unlink()
+        if len(faces_dataset) == 0:
+            raise ValueError(f'No checkpoint found from face extraction. '
+                             f'Is the the source path {source_dir} correct for the videos in your dataframe?')
+        # Save videos with updated faces
+        # NOTE: this overwrites the input videos DataFrame file in place.
+        print('Saving videos DataFrame to {}'.format(videodataset_path))
+        df_videos.to_pickle(str(videodataset_path))
+        # Derive the output file name. When offset and/or num are set, a
+        # '_from_video_X[_to_video_Y]' suffix is appended; if --facesdf is a
+        # directory the base name is 'faces_df', otherwise it is the given
+        # file name up to its first '.'.
+        if offset > 0:
+            if num > 0:
+                if facesdataset_path.is_dir():
+                    facesdataset_path = facesdataset_path.joinpath(
+                        'faces_df_from_video_{}_to_video_{}.pkl'.format(offset, num + offset))
+                else:
+                    facesdataset_path = facesdataset_path.parent.joinpath(
+                        str(facesdataset_path.parts[-1]).split('.')[0] + '_from_video_{}_to_video_{}.pkl'.format(offset,
+                                                                                                                 num + offset))
+            else:
+                if facesdataset_path.is_dir():
+                    facesdataset_path = facesdataset_path.joinpath('faces_df_from_video_{}.pkl'.format(offset))
+                else:
+                    facesdataset_path = facesdataset_path.parent.joinpath(
+                        str(facesdataset_path.parts[-1]).split('.')[0] + '_from_video_{}.pkl'.format(offset))
+        elif num > 0:
+            if facesdataset_path.is_dir():
+                facesdataset_path = facesdataset_path.joinpath(
+                    'faces_df_from_video_{}_to_video_{}.pkl'.format(0, num))
+            else:
+                facesdataset_path = facesdataset_path.parent.joinpath(
+                    str(facesdataset_path.parts[-1]).split('.')[0] + '_from_video_{}_to_video_{}.pkl'.format(0, num))
+        else:
+            if facesdataset_path.is_dir():
+                facesdataset_path = facesdataset_path.joinpath('faces_df.pkl')  # just a check if the path is a dir
+        # Creates directory (if doesn't exist)
+        facesdataset_path.parent.mkdir(parents=True, exist_ok=True)
+        print('Saving faces DataFrame to {}'.format(facesdataset_path))
+        df_faces = pd.concat(faces_dataset, axis=0, )
+        # Shrink the dtypes: category for the video key, int16 for the
+        # coordinates, int8 for the subject id.
+        df_faces['video'] = df_faces['video'].astype('category')
+        for key in ['kp1x', 'kp1y', 'kp2x', 'kp2y', 'kp3x',
+                    'kp3y', 'kp4x', 'kp4y', 'kp5x', 'kp5y', 'kp6x', 'kp6y', 'left',
+                    'top', 'right', 'bottom', ]:
+            df_faces[key] = df_faces[key].astype(np.int16)
+        df_faces['videosubject'] = df_faces['videosubject'].astype(np.int8)
+        # Eventually remove duplicates
+        df_faces = df_faces.loc[~df_faces.index.duplicated(keep='first')]
+        # Copy over whichever of these per-video columns exist in df_videos,
+        # joining the faces' 'video' column against the videos index.
+        fields_to_preserve_from_video = [i for i in
+                                         ['folder', 'subject', 'scene', 'cluster', 'nfaces', 'test'] if
+                                         i in df_videos]
+        df_faces = pd.merge(df_faces, df_videos[fields_to_preserve_from_video], left_on='video',
+                            right_index=True)
+        df_faces.to_pickle(str(facesdataset_path))
+    print('Completed!')
+def save_jpg(args: Tuple[Image.Image, Path or str]):
+    image, path = args
+    image.save(path, quality=95, subsampling='4:4:4')
+def process_video(item: Tuple[pd.Index, pd.Series],
+                  source_dir: Path,
+                  facedestination_dir: Path,
+                  checkpoint_folder: Path,
+                  face_size: int,
+                  face_extractor: FaceExtractor,
+                  lazycheck: bool = False,
+                  deepcheck: bool = False,
+                  ) -> (pd.DataFrame, Path, List[Tuple[Image.Image, Path]]) or None:
+    # Instatiate Index and Series
+    idx, record = item
+    # Checkpoint
+    video_faces_checkpoint_path = checkpoint_folder.joinpath(record['path']).with_suffix('.faces.pkl')
+    if not lazycheck:
+        if video_faces_checkpoint_path.exists():
+            try:
+                df_video_faces = pd.read_pickle(str(video_faces_checkpoint_path))
+                for _, r in df_video_faces.iterrows():
+                    face_path = facedestination_dir.joinpath(r.name)
+                    assert (face_path.exists())
+                    if deepcheck:
+                        img = Image.open(face_path)
+                        img_arr = np.asarray(img)
+                        assert (img_arr.ndim == 3)
+                        assert (np.prod(img_arr.shape) > 0)
+            except Exception as e:
+                print('Error while checking: {}'.format(video_faces_checkpoint_path))
+                print(e)
+                video_faces_checkpoint_path.unlink()
+    if not (video_faces_checkpoint_path.exists()):
+        try:
+            video_face_dict_list = []
+            # Load faces
+            current_video_path = source_dir.joinpath(record['path'])
+            if not current_video_path.exists():
+                raise FileNotFoundError(f'Unable to find {current_video_path}.'
+                                        f'Are you sure that {source_dir} is the correct source directory for the video '
+                                        f'you indexed in the dataframe?')
+            frames = face_extractor.process_video(current_video_path)
+            if len(frames) == 0:
+                return
+            face_extractor.keep_only_best_face(frames)
+            for frame_idx, frame in enumerate(frames):
+                frames[frame_idx]['subjects'] = [0] * len(frames[frame_idx]['detections'])
+            # Extract and save faces, bounding boxes, keypoints
+            images_to_save: List[Tuple[Image.Image, Path]] = []
+            for frame_idx, frame in enumerate(frames):
+                if len(frames[frame_idx]['detections']):
+                    fullframe = Image.fromarray(frames[frame_idx]['frame'])
+                    # Preserve the only found face even if not a good one, otherwise preserve only clusters > -1
+                    subjects = np.unique(frames[frame_idx]['subjects'])
+                    if len(subjects) > 1:
+                        subjects = np.asarray([s for s in subjects if s > -1])
+                    for face_idx, _ in enumerate(frame['faces']):
+                        subj_id = frames[frame_idx]['subjects'][face_idx]
+                        if subj_id in subjects:  # Exclude outliers if other faces detected
+                            face_path = facedestination_dir.joinpath(record['path'], 'fr{:03d}_subj{:1d}.jpg'.format(
+                                frames[frame_idx]['frame_idx'], subj_id))
+                            face_dict = {'facepath': str(face_path.relative_to(facedestination_dir)), 'video': idx,
+                                         'label': record['label'], 'videosubject': subj_id,
+                                         'original': record['original']}
+                            # add attibutes for ff++
+                            if 'class' in record.keys():
+                                face_dict.update({'class': record['class']})
+                            if 'source' in record.keys():
+                                face_dict.update({'source': record['source']})
+                            if 'quality' in record.keys():
+                                face_dict.update({'quality': record['quality']})
+                            for field_idx, key in enumerate(blazeface.BlazeFace.detection_keys):
+                                face_dict[key] = frames[frame_idx]['detections'][face_idx][field_idx]
+                            cropping_bb = adapt_bb(frame_height=fullframe.height,
+                                                   frame_width=fullframe.width,
+                                                   bb_height=face_size,
+                                                   bb_width=face_size,
+                                                   left=face_dict['xmin'],
+                                                   top=face_dict['ymin'],
+                                                   right=face_dict['xmax'],
+                                                   bottom=face_dict['ymax'])
+                            face = fullframe.crop(cropping_bb)
+                            for key in blazeface.BlazeFace.detection_keys:
+                                if (key[0] == 'k' and key[-1] == 'x') or (key[0] == 'x'):
+                                    face_dict[key] -= cropping_bb[0]
+                                elif (key[0] == 'k' and key[-1] == 'y') or (key[0] == 'y'):
+                                    face_dict[key] -= cropping_bb[1]
+                            face_dict['left'] = face_dict.pop('xmin')
+                            face_dict['top'] = face_dict.pop('ymin')
+                            face_dict['right'] = face_dict.pop('xmax')
+                            face_dict['bottom'] = face_dict.pop('ymax')
+                            face_path.parent.mkdir(parents=True, exist_ok=True)
+                            images_to_save.append((face, face_path))
+                            video_face_dict_list.append(face_dict)
+            if len(video_face_dict_list) > 0:
+                df_video_faces = pd.DataFrame(video_face_dict_list)
+                df_video_faces.index = df_video_faces['facepath']
+                del df_video_faces['facepath']
+                # type conversions
+                for key in ['kp1x', 'kp1y', 'kp2x', 'kp2y', 'kp3x', 'kp3y',
+                            'kp4x', 'kp4y', 'kp5x', 'kp5y', 'kp6x', 'kp6y', 'left', 'top',
+                            'right', 'bottom']:
+                    df_video_faces[key] = df_video_faces[key].astype(np.int16)
+                df_video_faces['conf'] = df_video_faces['conf'].astype(np.float32)
+                df_video_faces['video'] = df_video_faces['video'].astype('category')
+                video_faces_checkpoint_path.parent.mkdir(parents=True, exist_ok=True)
+            else:
+                print('No faces extracted for video {}'.format(record['path']))
+                df_video_faces = pd.DataFrame()
+            return df_video_faces, video_faces_checkpoint_path, images_to_save
+        except Exception as e:
+            print('Error while processing: {}'.format(record['path']))
+            print("-" * 60)
+            traceback.print_exc(file=sys.stdout, limit=5)
+            print("-" * 60)
+            return
+# Script entry point: forward the command-line arguments (program name excluded) to main()
+if __name__ == '__main__':
+    main(sys.argv[1:])

models/icpr2020dfdc/index_celebdf.py ADDED Viewed

	@@ -0,0 +1,85 @@

+"""
+Index Celeb-DF v2
+Image and Sound Processing Lab - Politecnico di Milano
+Nicolò Bonettini
+Edoardo Daniele Cannas
+Sara Mandelli
+Luca Bondi
+Paolo Bestagini
+"""
+import argparse
+from multiprocessing import Pool
+from pathlib import Path
+import numpy as np
+import pandas as pd
+from isplutils.utils import extract_meta_av, extract_meta_cv
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--source', type=Path, help='Source dir',
+                        required=True)
+    parser.add_argument('--videodataset', type=Path, default='data/celebdf_videos.pkl',
+                        help='Path to save the videos DataFrame')
+    args = parser.parse_args()
+    ## Parameters parsing
+    source_dir: Path = args.source
+    videodataset_path: Path = args.videodataset
+    # Create ouput folder (if doesn't exist)
+    videodataset_path.parent.mkdir(parents=True, exist_ok=True)
+    ## DataFrame
+    if videodataset_path.exists():
+        print('Loading video DataFrame')
+        df_videos = pd.read_pickle(videodataset_path)
+    else:
+        print('Creating video DataFrame')
+        split_file = Path(source_dir).joinpath('List_of_testing_videos.txt')
+        if not split_file.exists():
+            raise FileNotFoundError('Unable to find "List_of_testing_videos.txt" in {}'.format(source_dir))
+        test_videos_df = pd.read_csv(split_file, delimiter=' ', header=0, index_col=1)
+        ff_videos = Path(source_dir).rglob('*.mp4')
+        df_videos = pd.DataFrame(
+            {'path': [f.relative_to(source_dir) for f in ff_videos]})
+        df_videos['height'] = df_videos['width'] = df_videos['frames'] = np.zeros(len(df_videos), dtype=np.uint16)
+        with Pool() as p:
+            meta = p.map(extract_meta_av, df_videos['path'].map(lambda x: str(source_dir.joinpath(x))))
+        meta = np.stack(meta)
+        df_videos.loc[:, ['height', 'width', 'frames']] = meta
+        # Fix for videos that av cannot decode properly
+        for idx, record in df_videos[df_videos['frames'] == 0].iterrows():
+            meta = extract_meta_cv(str(source_dir.joinpath(record['path'])))
+            df_videos.loc[idx, ['height', 'width', 'frames']] = meta
+        df_videos['class'] = df_videos['path'].map(lambda x: x.parts[0]).astype('category')
+        df_videos['label'] = df_videos['class'].map(
+            lambda x: True if x == 'Celeb-synthesis' else False)  # True is FAKE, False is REAL
+        df_videos['name'] = df_videos['path'].map(lambda x: x.with_suffix('').name)
+        df_videos['original'] = -1 * np.ones(len(df_videos), dtype=np.int16)
+        df_videos.loc[(df_videos['label'] == True), 'original'] = \
+            df_videos[(df_videos['label'] == True)]['name'].map(
+                lambda x: df_videos.index[
+                    np.flatnonzero(df_videos['name'] == '_'.join([x.split('_')[0], x.split('_')[2]]))[0]]
+            )
+        df_videos['test'] = df_videos['path'].map(str).isin(test_videos_df.index)
+        print('Saving video DataFrame to {}'.format(videodataset_path))
+        df_videos.to_pickle(str(videodataset_path))
+    print('Real videos: {:d}'.format(sum(df_videos['label'] == 0)))
+    print('Fake videos: {:d}'.format(sum(df_videos['label'] == 1)))
+# Script entry point: main() parses the command-line arguments by itself
+if __name__ == '__main__':
+    main()

models/icpr2020dfdc/index_dfdc.py ADDED Viewed

	@@ -0,0 +1,94 @@

+"""
+Index the official Kaggle training dataset and prepares a train and validation set based on folders
+Video Face Manipulation Detection Through Ensemble of CNNs
+Image and Sound Processing Lab - Politecnico di Milano
+Nicolò Bonettini
+Edoardo Daniele Cannas
+Sara Mandelli
+Luca Bondi
+Paolo Bestagini
+"""
+import sys
+import argparse
+from multiprocessing import Pool
+from pathlib import Path
+import numpy as np
+import pandas as pd
+from tqdm import tqdm
+from isplutils.utils import extract_meta_av
+def parse_args(argv):
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--source', type=Path, help='Source dir', required=True)
+    parser.add_argument('--videodataset', type=Path, default='data/dfdc_videos.pkl',
+                        help='Path to save the videos DataFrame')
+    parser.add_argument('--batch', type=int, help='Batch size', default=64)
+    return parser.parse_args(argv)
+def main(argv):
+    ## Parameters parsing
+    args = parse_args(argv)
+    source_dir: Path = args.source
+    videodataset_path: Path = args.videodataset
+    batch_size: int = args.batch
+    ## DataFrame
+    if videodataset_path.exists():
+        print('Loading video DataFrame')
+        df_videos = pd.read_pickle(videodataset_path)
+    else:
+        print('Creating video DataFrame')
+        # Create ouptut folder
+        videodataset_path.parent.mkdir(parents=True, exist_ok=True)
+        # Index
+        df_train_list = list()
+        for idx, json_path in enumerate(tqdm(sorted(source_dir.rglob('metadata.json')), desc='Indexing')):
+            df_tmp = pd.read_json(json_path, orient='index')
+            df_tmp['path'] = df_tmp.index.map(
+                lambda x: str(json_path.parent.relative_to(source_dir).joinpath(x)))
+            df_tmp['folder'] = int(str(json_path.parts[-2]).split('_')[-1])
+            df_train_list.append(df_tmp)
+        df_videos = pd.concat(df_train_list, axis=0, verify_integrity=True)
+        # Save space
+        del df_videos['split']
+        df_videos['label'] = df_videos['label'] == 'FAKE'
+        df_videos['original'] = df_videos['original'].astype('category')
+        df_videos['folder'] = df_videos['folder'].astype(np.uint8)
+        # Collect metadata
+        paths_arr = np.asarray(df_videos.path.map(lambda x: str(source_dir.joinpath(x))))
+        height_list = []
+        width_list = []
+        frames_list = []
+        with Pool() as pool:
+            for batch_idx0 in tqdm(np.arange(start=0, stop=len(df_videos), step=batch_size), desc='Metadata'):
+                batch_res = pool.map(extract_meta_av, paths_arr[batch_idx0:batch_idx0 + batch_size])
+                for res in batch_res:
+                    height_list.append(res[0])
+                    width_list.append(res[1])
+                    frames_list.append(res[2])
+        df_videos['height'] = np.asarray(height_list, dtype=np.uint16)
+        df_videos['width'] = np.asarray(width_list, dtype=np.uint16)
+        df_videos['frames'] = np.asarray(frames_list, dtype=np.uint16)
+        print('Saving video DataFrame to {}'.format(videodataset_path))
+        df_videos.to_pickle(str(videodataset_path))
+    print('Real videos: {:d}'.format(sum(df_videos['label'] == 0)))
+    print('Fake videos: {:d}'.format(sum(df_videos['label'] == 1)))
+# Script entry point: forward the command-line arguments (program name excluded) to main()
+if __name__ == '__main__':
+    main(sys.argv[1:])

models/icpr2020dfdc/index_ffpp.py ADDED Viewed

	@@ -0,0 +1,92 @@

+"""
+Index FaceForensics++
+Video Face Manipulation Detection Through Ensemble of CNNs
+Image and Sound Processing Lab - Politecnico di Milano
+Nicolò Bonettini
+Edoardo Daniele Cannas
+Sara Mandelli
+Luca Bondi
+Paolo Bestagini
+"""
+import argparse
+import sys
+from multiprocessing import Pool
+from pathlib import Path
+import numpy as np
+import pandas as pd
+from isplutils.utils import extract_meta_av, extract_meta_cv
+def parse_args(argv):
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--source', type=Path, help='Source dir',
+                        default='dataset/ffpp/faceforensics')
+    parser.add_argument('--videodataset', type=Path, default='data/ffpp_videos.pkl',
+                        help='Path to save the videos DataFrame')
+    return parser.parse_args(argv)
+def main(argv):
+    ## Parameters parsing
+    args = parse_args(argv)
+    source_dir: Path = args.source
+    videodataset_path: Path = args.videodataset
+    # Create ouput folder (if doesn't exist)
+    videodataset_path.parent.mkdir(parents=True, exist_ok=True)
+    ## DataFrame
+    if videodataset_path.exists():
+        print('Loading video DataFrame')
+        df_videos = pd.read_pickle(videodataset_path)
+    else:
+        print('Creating video DataFrame')
+        ff_videos = Path(source_dir).rglob('*.mp4')
+        df_videos = pd.DataFrame(
+            {'path': [f.relative_to(source_dir) for f in ff_videos if 'mask' not in str(f) and 'raw' not in str(f)]})
+        df_videos['height'] = df_videos['width'] = df_videos['frames'] = np.zeros(len(df_videos), dtype=np.uint16)
+        with Pool() as p:
+            meta = p.map(extract_meta_av, df_videos['path'].map(lambda x: str(source_dir.joinpath(x))))
+        meta = np.stack(meta)
+        df_videos.loc[:, ['height', 'width', 'frames']] = meta
+        # Fix for videos that av cannot decode properly
+        for idx, record in df_videos[df_videos['frames'] == 0].iterrows():
+            meta = extract_meta_cv(str(source_dir.joinpath(record['path'])))
+            df_videos.loc[idx, ['height', 'width', 'frames']] = meta
+        df_videos['class'] = df_videos['path'].map(lambda x: x.parts[0]).astype('category')
+        df_videos['label'] = df_videos['class'].map(
+            lambda x: True if x == 'manipulated_sequences' else False)  # True is FAKE, False is REAL
+        df_videos['source'] = df_videos['path'].map(lambda x: x.parts[1]).astype('category')
+        df_videos['quality'] = df_videos['path'].map(lambda x: x.parts[2]).astype('category')
+        df_videos['name'] = df_videos['path'].map(lambda x: x.with_suffix('').parts[-1])
+        df_videos['original'] = -1 * np.ones(len(df_videos), dtype=np.int16)
+        df_videos.loc[(df_videos['label'] == True) & (df_videos['source'] != 'DeepFakeDetection'), 'original'] = \
+            df_videos[(df_videos['label'] == True) & (df_videos['source'] != 'DeepFakeDetection')]['name'].map(
+                lambda x: df_videos.index[np.flatnonzero(df_videos['name'] == x.split('_')[0])[0]]
+            )
+        df_videos.loc[(df_videos['label'] == True) & (df_videos['source'] == 'DeepFakeDetection'), 'original'] = \
+            df_videos[(df_videos['label'] == True) & (df_videos['source'] == 'DeepFakeDetection')]['name'].map(
+                lambda x: df_videos.index[
+                    np.flatnonzero(df_videos['name'] == x.split('_')[0] + '__' + x.split('__')[1])[0]]
+            )
+        print('Saving video DataFrame to {}'.format(videodataset_path))
+        df_videos.to_pickle(str(videodataset_path))
+    print('Real videos: {:d}'.format(sum(df_videos['label'] == 0)))
+    print('Fake videos: {:d}'.format(sum(df_videos['label'] == 1)))
+# Script entry point: forward the command-line arguments (program name excluded) to main()
+if __name__ == '__main__':
+    main(sys.argv[1:])

models/icpr2020dfdc/isplutils/__init__.py ADDED Viewed

File without changes

models/icpr2020dfdc/isplutils/data.py ADDED Viewed

	@@ -0,0 +1,263 @@

+"""
+Video Face Manipulation Detection Through Ensemble of CNNs
+Image and Sound Processing Lab - Politecnico di Milano
+Nicolò Bonettini
+Edoardo Daniele Cannas
+Sara Mandelli
+Luca Bondi
+Paolo Bestagini
+"""
+import os
+from pathlib import Path
+from typing import List
+import albumentations as A
+import numpy as np
+import pandas as pd
+import torch
+from PIL import Image
+from albumentations.pytorch import ToTensorV2
+from torch.utils.data import Dataset, IterableDataset
+from .utils import extract_bb
+def load_face(record: pd.Series, root: str, size: int, scale: str, transformer: A.BasicTransform) -> torch.Tensor:
+    path = os.path.join(str(root), str(record.name))
+    autocache = size < 256 or scale == 'tight'
+    if scale in ['crop', 'scale', ]:
+        cached_path = str(Path(root).joinpath('autocache', scale, str(size), str(record.name)).with_suffix('.jpg'))
+    else:
+        # when self.scale == 'tight' the extracted face is not dependent on size
+        cached_path = str(Path(root).joinpath('autocache', scale, str(record.name)).with_suffix('.jpg'))
+    face = np.zeros((size, size, 3), dtype=np.uint8)
+    if os.path.exists(cached_path):
+        try:
+            face = Image.open(cached_path)
+            face = np.array(face)
+            if len(face.shape) != 3:
+                raise RuntimeError('Incorrect format: {}'.format(path))
+        except KeyboardInterrupt as e:
+            # We want keybord interrupts to be propagated
+            raise e
+        except (OSError, IOError) as e:
+            print('Deleting corrupted cache file: {}'.format(cached_path))
+            print(e)
+            os.unlink(cached_path)
+            face = np.zeros((size, size, 3), dtype=np.uint8)
+    if not os.path.exists(cached_path):
+        try:
+            frame = Image.open(path)
+            bb = record['left'], record['top'], record['right'], record['bottom']
+            face = extract_bb(frame, bb=bb, size=size, scale=scale)
+            if autocache:
+                os.makedirs(os.path.dirname(cached_path), exist_ok=True)
+                face.save(cached_path, quality=95, subsampling='4:4:4')
+            face = np.array(face)
+            if len(face.shape) != 3:
+                raise RuntimeError('Incorrect format: {}'.format(path))
+        except KeyboardInterrupt as e:
+            # We want keybord interrupts to be propagated
+            raise e
+        except (OSError, IOError) as e:
+            print('Error while reading: {}'.format(path))
+            print(e)
+            face = np.zeros((size, size, 3), dtype=np.uint8)
+    face = transformer(image=face)['image']
+    return face
+class FrameFaceIterableDataset(IterableDataset):
+    def __init__(self,
+                 roots: List[str],
+                 dfs: List[pd.DataFrame],
+                 size: int, scale: str,
+                 num_samples: int = -1,
+                 transformer: A.BasicTransform = ToTensorV2(),
+                 output_index: bool = False,
+                 labels_map: dict = None,
+                 seed: int = None):
+        """
+        :param roots: List of root folders for frames cache
+        :param dfs: List of DataFrames of cached frames with 'bb' column as array of 4 elements (left,top,right,bottom)
+                   and 'label' column
+        :param size: face size
+        :param num_samples:
+        :param scale: Rescale the face to the given size, preserving the aspect ratio.
+                      If false crop around center to the given size
+        :param transformer:
+        :param output_index: enable output of df_frames index
+        :param labels_map: map from 'REAL' and 'FAKE' to actual labels
+        """
+        self.dfs = dfs
+        self.size = int(size)
+        self.seed0 = int(seed) if seed is not None else np.random.choice(2 ** 32)
+        # adapt indices
+        dfs_adapted = [df.copy() for df in self.dfs]
+        for df_idx, df in enumerate(dfs_adapted):
+            mi = pd.MultiIndex.from_tuples([(df_idx, key) for key in df.index], names=['df_idx', 'df_key'])
+            df.index = mi
+        # Concat
+        self.df = pd.concat(dfs_adapted, axis=0, join='inner')
+        self.df_real = self.df[self.df['label'] == 0]
+        self.df_fake = self.df[self.df['label'] == 1]
+        self.longer_set = 'real' if len(self.df_real) > len(self.df_fake) else 'fake'
+        self.num_samples = max(len(self.df_real), len(self.df_fake)) * 2
+        self.num_samples = min(self.num_samples, num_samples) if num_samples > 0 else self.num_samples
+        self.output_idx = bool(output_index)
+        self.scale = str(scale)
+        self.roots = [str(r) for r in roots]
+        self.transformer = transformer
+        self.labels_map = labels_map
+        if self.labels_map is None:
+            self.labels_map = {False: np.array([0., ]), True: np.array([1., ])}
+        else:
+            self.labels_map = dict(self.labels_map)
+    def _get_face(self, item: pd.Index) -> (torch.Tensor, torch.Tensor) or (torch.Tensor, torch.Tensor, str):
+        record = self.dfs[item[0]].loc[item[1]]
+        face = load_face(record=record,
+                         root=self.roots[item[0]],
+                         size=self.size,
+                         scale=self.scale,
+                         transformer=self.transformer)
+        label = self.labels_map[record.label]
+        if self.output_idx:
+            return face, label, record.name
+        else:
+            return face, label
+    def __len__(self):
+        return self.num_samples
+    def __iter__(self):
+        random_fake_idxs, random_real_idxs = get_iterative_real_fake_idxs(
+            df_real=self.df_real,
+            df_fake=self.df_fake,
+            num_samples=self.num_samples,
+            seed0=self.seed0
+        )
+        while len(random_fake_idxs) >= 1 and len(random_real_idxs) >= 1:
+            yield self._get_face(random_fake_idxs.pop())
+            yield self._get_face(random_real_idxs.pop())
+def get_iterative_real_fake_idxs(df_real: pd.DataFrame, df_fake: pd.DataFrame,
+                                 num_samples: int, seed0: int):
+    longer_set = 'real' if len(df_real) > len(df_fake) else 'fake'
+    worker_info = torch.utils.data.get_worker_info()
+    if worker_info is None:
+        seed = seed0
+        np.random.seed(seed)
+        worker_num_couple_samples = num_samples // 2
+        fake_idxs_portion = np.random.choice(df_fake.index, worker_num_couple_samples,
+                                             replace=longer_set == 'real')
+        real_idxs_portion = np.random.choice(df_real.index, worker_num_couple_samples,
+                                             replace=longer_set == 'fake')
+    else:
+        worker_id = worker_info.id
+        seed = seed0 + worker_id
+        np.random.seed(seed)
+        worker_num_couple_samples = (num_samples // 2) // worker_info.num_workers
+        if longer_set == 'fake':
+            fake_idxs_portion = df_fake.index[
+                                worker_id * worker_num_couple_samples:(worker_id + 1) * worker_num_couple_samples]
+            real_idxs_portion = np.random.choice(df_real.index, worker_num_couple_samples, replace=True)
+        else:
+            real_idxs_portion = df_real.index[
+                                worker_id * worker_num_couple_samples:(worker_id + 1) * worker_num_couple_samples]
+            fake_idxs_portion = np.random.choice(df_fake.index, worker_num_couple_samples,
+                                                 replace=True)
+    random_fake_idxs = list(np.random.permutation(fake_idxs_portion))
+    random_real_idxs = list(np.random.permutation(real_idxs_portion))
+    assert (len(random_fake_idxs) == len(random_real_idxs))
+    return random_fake_idxs, random_real_idxs
+class FrameFaceDatasetTest(Dataset):
+    def __init__(self, root: str, df: pd.DataFrame,
+                 size: int, scale: str,
+                 transformer: A.BasicTransform = ToTensorV2(),
+                 labels_map: dict = None,
+                 aug_transformers: List[A.BasicTransform] = None):
+        """
+        :param root: root folder for frames cache
+        :param df: DataFrame of cached frames with 'bb' column as array of 4 elements (left,top,right,bottom)
+                   and 'label' column
+        :param size: face size
+        :param num_samples:
+        :param scale: Rescale the face to the given size, preserving the aspect ratio.
+                      If false crop around center to the given size
+        :param transformer:
+        :param labels_map: dcit to map df labels
+        :param aug_transformers: if not None, creates multiple copies of the same sample according to the provided augmentations
+        """
+        self.df = df
+        self.size = int(size)
+        self.scale = str(scale)
+        self.root = str(root)
+        self.transformer = transformer
+        self.aug_transformers = aug_transformers
+        self.labels_map = labels_map
+        if self.labels_map is None:
+            self.labels_map = {False: np.array([0., ]), True: np.array([1., ])}
+        else:
+            self.labels_map = dict(self.labels_map)
+    def _get_face(self, item: pd.Index) -> (torch.Tensor, torch.Tensor) or (torch.Tensor, torch.Tensor, str):
+        record = self.df.loc[item]
+        label = self.labels_map[record.label]
+        if self.aug_transformers is None:
+            face = load_face(record=record,
+                             root=self.root,
+                             size=self.size,
+                             scale=self.scale,
+                             transformer=self.transformer)
+            return face, label
+        else:
+            faces = []
+            for aug_transf in self.aug_transformers:
+                faces.append(
+                    load_face(record=record,
+                              root=self.root,
+                              size=self.size,
+                              scale=self.scale,
+                              transformer=A.Compose([aug_transf, self.transformer])
+                              ))
+            faces = torch.stack(faces)
+            return faces, label
+    def __len__(self):
+        return len(self.df)
+    def __getitem__(self, item):
+        return self._get_face(self.df.index[item])

models/icpr2020dfdc/isplutils/data_siamese.py ADDED Viewed

	@@ -0,0 +1,78 @@

+"""
+Video Face Manipulation Detection Through Ensemble of CNNs
+Image and Sound Processing Lab - Politecnico di Milano
+Nicolò Bonettini
+Edoardo Daniele Cannas
+Sara Mandelli
+Luca Bondi
+Paolo Bestagini
+"""
+from typing import List
+import albumentations as A
+import pandas as pd
+from albumentations.pytorch import ToTensorV2
+from .data import FrameFaceIterableDataset, get_iterative_real_fake_idxs
+class FrameFaceTripletIterableDataset(FrameFaceIterableDataset):
+    def __init__(self,
+                 roots: List[str],
+                 dfs: List[pd.DataFrame],
+                 size: int,
+                 scale: str,
+                 num_triplets: int = -1,
+                 transformer: A.BasicTransform = ToTensorV2(),
+                 seed: int = None):
+        """
+        :param roots: List of root folders for frames cache
+        :param dfs: List of DataFrames of cached frames with 'bb' column as array of 4 elements (left,top,right,bottom)
+                   and 'label' column
+        :param size: face size
+        :param num_triplets: number of samples for the dataset
+        :param idxs: sampling indexes triplets (each element is a key for anchor, positive, negative)
+        :param scale: Rescale the face to the given size, preserving the aspect ratio.
+                      If false crop around center to the given size
+        :param transformer:
+        :param seed:
+        """
+        super(FrameFaceTripletIterableDataset, self).__init__(
+            roots=roots,
+            dfs=dfs,
+            size=size,
+            scale=scale,
+            num_samples=num_triplets * 3,
+            transformer=transformer,
+            seed=seed
+        )
+        self.num_triplet_couples = self.num_samples // 6
+        self.num_triplets = self.num_triplet_couples * 2
+        self.num_samples = self.num_triplets * 3
+    def __len__(self):
+        return self.num_triplets
+    def __iter__(self):
+        random_fake_idxs, random_real_idxs = get_iterative_real_fake_idxs(
+            df_real=self.df_real,
+            df_fake=self.df_fake,
+            num_samples=self.num_samples,
+            seed0=self.seed0
+        )
+        while len(random_fake_idxs) >= 3 and len(random_real_idxs) >= 3:
+            a = self._get_face(random_fake_idxs.pop())[0]
+            p = self._get_face(random_fake_idxs.pop())[0]
+            n = self._get_face(random_real_idxs.pop())[0]
+            yield a, p, n
+            a = self._get_face(random_real_idxs.pop())[0]
+            p = self._get_face(random_real_idxs.pop())[0]
+            n = self._get_face(random_fake_idxs.pop())[0]
+            yield a, p, n

models/icpr2020dfdc/isplutils/split.py ADDED Viewed

	@@ -0,0 +1,135 @@

+from typing import List, Dict, Tuple
+"""
+Video Face Manipulation Detection Through Ensemble of CNNs
+Image and Sound Processing Lab - Politecnico di Milano
+Nicolò Bonettini
+Edoardo Daniele Cannas
+Sara Mandelli
+Luca Bondi
+Paolo Bestagini
+"""
+import numpy as np
+import pandas as pd
+# Names of the known dataset splits (see get_split_df for how each one is built):
+# - 'dfdc-35-5-10': folders 0-34 for training, 35-39 for validation, 40-49 for test
+# - 'ff-c23-720-140-140': 'c23' quality, 720 original videos for training, 140 for validation,
+#   the remaining ones for test
+# - '...-<N>fpv' suffix: N rows are randomly drawn for each video of the split
+available_datasets = [
+    'dfdc-35-5-10',
+    'ff-c23-720-140-140',
+    'ff-c23-720-140-140-5fpv',
+    'ff-c23-720-140-140-10fpv',
+    'ff-c23-720-140-140-15fpv',
+    'ff-c23-720-140-140-20fpv',
+    'ff-c23-720-140-140-25fpv',
+    'celebdf',  # just for convenience, not used in the original paper
+]
def load_df(dfdc_df_path: str, ffpp_df_path: str, dfdc_faces_dir: str, ffpp_faces_dir: str, dataset: str) -> Tuple[pd.DataFrame, str]:
    """
    Load the pickled faces DataFrame of the dataset family `dataset` belongs to.

    :param dfdc_df_path: path to the pickled DataFrame used for datasets whose name starts with 'dfdc'
    :param ffpp_df_path: path to the pickled DataFrame used for datasets whose name starts with 'ff-'
    :param dfdc_faces_dir: root directory returned together with the DFDC DataFrame
    :param ffpp_faces_dir: root directory returned together with the FF++ DataFrame
    :param dataset: dataset name; only its prefix ('dfdc' or 'ff-') is inspected
    :return: (DataFrame, faces root directory)
    :raises NotImplementedError: for any other dataset name (this includes 'celebdf', for which no
        path parameters exist in this function)
    """
    # NOTE(review): pd.read_pickle unpickles arbitrary objects; only point it at trusted index files.
    if dataset.startswith('dfdc'):
        df = pd.read_pickle(dfdc_df_path)
        root = dfdc_faces_dir
    elif dataset.startswith('ff-'):
        df = pd.read_pickle(ffpp_df_path)
        root = ffpp_faces_dir
    else:
        raise NotImplementedError('Unknown dataset: {}'.format(dataset))
    return df, root
def _rows_for_originals(df: pd.DataFrame, originals) -> pd.DataFrame:
    """Return the rows whose 'original' is in `originals`, followed by the rows whose 'video' is."""
    return pd.concat((df[df['original'].isin(originals)], df[df['video'].isin(originals)]), axis=0)


def get_split_df(df: pd.DataFrame, dataset: str, split: str) -> pd.DataFrame:
    """
    Select the rows of `df` belonging to the requested split of `dataset`.

    The FF++ and Celeb-DF selections are random but reproducible: the global NumPy random state
    is saved, a fixed seed (41) is set for the selection, and the saved state is restored
    afterwards, even when the selection raises.

    :param df: DataFrame indexing the faces. Columns read depend on the dataset:
               'folder' (dfdc); 'source', 'quality', 'video', 'original' (ff-c23-*);
               'label', 'test', 'video', 'original' (celebdf)
    :param dataset: 'dfdc-35-5-10', any name starting with 'ff-c23-720-140-140' (optionally ending
                    with '-<N>fpv' to keep N randomly chosen rows per video), or 'celebdf'
    :param split: 'train', 'val' or 'test'
    :return: DataFrame with the rows of the split
    :raises NotImplementedError: unknown dataset or unknown split
    :raises ValueError: (from np.random.choice) if an fpv dataset is requested and a video has
                        fewer than N rows
    """
    if dataset == 'dfdc-35-5-10':
        # Split on the 'folder' column: 35 folders for training, 5 for validation, 10 for test
        folders_by_split = {'train': range(35), 'val': range(35, 40), 'test': range(40, 50)}
        if split not in folders_by_split:
            raise NotImplementedError('Unknown split: {}'.format(split))
        return df[df['folder'].isin(folders_by_split[split])]

    if dataset.startswith('ff-c23-720-140-140'):
        # Save random state
        st0 = np.random.get_state()
        try:
            # Set seed for this selection only
            np.random.seed(41)
            # Split on original videos
            crf = dataset.split('-')[1]
            random_youtube_videos = np.random.permutation(
                df[(df['source'] == 'youtube') & (df['quality'] == crf)]['video'].unique())
            originals_by_split = {
                'train': random_youtube_videos[:720],
                'val': random_youtube_videos[720:720 + 140],
                'test': random_youtube_videos[720 + 140:],
            }
            if split not in originals_by_split:
                raise NotImplementedError('Unknown split: {}'.format(split))
            split_df = _rows_for_originals(df, originals_by_split[split])
            if dataset.endswith('fpv'):
                # '...-<N>fpv' -> N frames per video
                fpv = int(dataset.rsplit('-', 1)[1][:-3])
                idxs = [
                    np.random.choice(split_df[split_df['video'] == video].index, fpv, replace=False)
                    for video in split_df['video'].unique()
                ]
                # np.concatenate rejects an empty list; an empty split simply stays empty
                if idxs:
                    split_df = split_df.loc[np.concatenate(idxs)]
            return split_df
        finally:
            # Restore random state, also on error, so callers never inherit the fixed seed
            np.random.set_state(st0)

    if dataset == 'celebdf':
        seed = 41
        num_real_train = 600
        # Save random state
        st0 = np.random.get_state()
        try:
            # Set seed for this selection only
            np.random.seed(seed)
            # Split on original videos
            random_train_val_real_videos = np.random.permutation(
                df[(df['label'] == False) & (df['test'] == False)]['video'].unique())
            train_orig = random_train_val_real_videos[:num_real_train]
            val_orig = random_train_val_real_videos[num_real_train:]
            if split == 'train':
                return _rows_for_originals(df, train_orig)
            if split == 'val':
                return _rows_for_originals(df, val_orig)
            if split == 'test':
                return df[df['test'] == True]
            raise NotImplementedError('Unknown split: {}'.format(split))
        finally:
            # Restore random state, also on error
            np.random.set_state(st0)

    raise NotImplementedError('Unknown dataset: {}'.format(dataset))
def make_splits(dfdc_df: str, ffpp_df: str, dfdc_dir: str, ffpp_dir: str, dbs: Dict[str, List[str]]) -> Dict[str, Dict[str, Tuple[pd.DataFrame, str]]]:
    """
    Build, for every requested split and dataset, the split DataFrame and its faces root.

    Each dataset is loaded once and reused for all the splits that mention it.

    :param dfdc_df: path to the DataFrame containing info on the faces extracted from the DFDC dataset with extract_faces.py
    :param ffpp_df: path to the DataFrame containing info on the faces extracted from the FF++ dataset with extract_faces.py
    :param dfdc_dir: path to the directory containing the faces extracted from the DFDC dataset with extract_faces.py
    :param ffpp_dir: path to the directory containing the faces extracted from the FF++ dataset with extract_faces.py
    :param dbs: mapping {split_name: [dataset_name, ...]}
                Example:
                {'train': ['dfdc-35-5-10'], 'val': ['dfdc-35-5-10']}
    :return: nested dictionary {split_name: {dataset_name: (split DataFrame, faces root)}}
                Example:
                {'train': {'dfdc-35-5-10': (dfdc_train_df, 'path/to/dir/of/DFDC/faces')}}
    """
    loaded = {}  # dataset_name -> (full DataFrame, faces root)
    splits = {}
    for split_name, dataset_names in dbs.items():
        per_dataset = splits[split_name] = {}
        for dataset_name in dataset_names:
            try:
                full_df, root = loaded[dataset_name]
            except KeyError:
                # First time this dataset is requested: load it and remember it
                full_df, root = loaded[dataset_name] = load_df(dfdc_df, ffpp_df, dfdc_dir, ffpp_dir, dataset_name)
            per_dataset[dataset_name] = (get_split_df(df=full_df, dataset=dataset_name, split=split_name), root)
    return splits

models/icpr2020dfdc/isplutils/utils.py ADDED Viewed

	@@ -0,0 +1,247 @@

+"""
+Video Face Manipulation Detection Through Ensemble of CNNs
+Image and Sound Processing Lab - Politecnico di Milano
+Nicolò Bonettini
+Edoardo Daniele Cannas
+Sara Mandelli
+Luca Bondi
+Paolo Bestagini
+"""
+from pprint import pprint
+from typing import Iterable, List
+import albumentations as A
+import cv2
+import numpy as np
+import scipy
+import torch
+from PIL import Image
+from albumentations.pytorch import ToTensorV2
+from matplotlib import pyplot as plt
+from torch import nn as nn
+from torchvision import transforms
def extract_meta_av(path: str) -> (int, int, int):
    """
    Extract video height, width and number of frames to index the files (PyAV backend)
    :param path: path of the video file
    :return: (height, width, frames) of the first video stream, or (0, 0, 0) if the file cannot be
             opened or has no video stream (the error is printed)
    """
    # Imported lazily so that the module can be used without PyAV installed
    import av
    try:
        video = av.open(path)
        try:
            video_stream = video.streams.video[0]
            return video_stream.height, video_stream.width, video_stream.frames
        finally:
            # Always release the container: indexing a dataset opens many files in a row
            video.close()
    except av.AVError as e:
        print('Error while reading file: {}'.format(path))
        print(e)
        return 0, 0, 0
    except IndexError as e:
        # No video stream in the container
        print('Error while processing file: {}'.format(path))
        print(e)
        return 0, 0, 0
def extract_meta_cv(path: str) -> (int, int, int):
    """
    Extract video height, width and number of frames to index the files (OpenCV backend)
    :param path: path of the video file
    :return: (height, width, num_frames) as reported by the capture properties, or (0, 0, 0) if an
             exception is raised while reading them (the error is printed)
    """
    try:
        vid = cv2.VideoCapture(path)
        try:
            num_frames = int(vid.get(cv2.CAP_PROP_FRAME_COUNT))
            height = int(vid.get(cv2.CAP_PROP_FRAME_HEIGHT))
            width = int(vid.get(cv2.CAP_PROP_FRAME_WIDTH))
            return height, width, num_frames
        finally:
            # Release the underlying file handle/decoder instead of waiting for garbage collection
            vid.release()
    except Exception as e:
        print('Error while reading file: {}'.format(path))
        print(e)
        return 0, 0, 0
def adapt_bb(frame_height: int, frame_width: int, bb_height: int, bb_width: int, left: int, top: int, right: int,
             bottom: int) -> (
        int, int, int, int):
    """
    Build a box of the requested size centered on the given bounding box, limited to the frame.

    The start of each side is clamped to 0 and its end to the frame size; the box is not shifted
    back inside the frame, so it may come out smaller than requested near the borders.
    :param frame_height: height of the frame
    :param frame_width: width of the frame
    :param bb_height: desired height of the new box
    :param bb_width: desired width of the new box
    :param left: left coordinate of the original bounding box
    :param top: top coordinate of the original bounding box
    :param right: right coordinate of the original bounding box
    :param bottom: bottom coordinate of the original bounding box
    :return: (new_left, new_top, new_right, new_bottom)
    """

    def span(edges_sum, length, limit):
        # Center is the floored midpoint of the two edges; start clamps to 0, end clamps to limit
        start = max(edges_sum // 2 - length // 2, 0)
        return start, min(start + length, limit)

    new_left, new_right = span(left + right, bb_width, frame_width)
    new_top, new_bottom = span(bottom + top, bb_height, frame_height)
    return new_left, new_top, new_right, new_bottom
def extract_bb(frame: Image.Image, bb: Iterable, scale: str, size: int) -> Image.Image:
    """
    Cut a face out of a frame, following the bounding box and the chosen scale policy
    :param frame: Entire frame
    :param bb: Bounding box (left,top,right,bottom) in the reference system of the frame
    :param scale: "scale" to crop a square with size equal to the maximum between height and width of the face, then scale to size
                  "crop" to crop a fixed square around face center,
                  "tight" to crop face exactly at the bounding box with no scaling
    :param size: size of the face
    :return: the cropped (and, for "scale", resized) face
    :raises ValueError: if scale is not one of the three policies
    """
    left, top, right, bottom = bb

    # Step 1: decide the height and width of the region to cut for the requested policy
    if scale == "scale":
        face_width = int(right) - int(left)
        face_height = int(bottom) - int(top)
        if face_width > 0 and face_height > 0:
            # The smaller ratio belongs to the longer side of the face
            ratio = min(size / face_height, size / face_width)
        else:
            ratio = 1.
        side = int(size / ratio)
        crop_height, crop_width = side, side
    elif scale == "crop":
        # Fixed size x size square around the center of the bounding box
        crop_height, crop_width = size, size
    elif scale == "tight":
        crop_height, crop_width = bottom - top, right - left
    else:
        raise ValueError('Unknown scale value: {}'.format(scale))

    # Step 2: center the region on the bounding box, limit it to the frame and cut it
    box = adapt_bb(frame.height, frame.width, crop_height, crop_width, left, top, right, bottom)
    face = frame.crop(box)

    # Step 3: only the "scale" policy resizes the result to size x size
    if scale == "scale":
        face = face.resize((size, size), Image.BILINEAR)
    return face
def showimage(img_tensor: torch.Tensor):
    """
    Display a normalized image tensor in a new matplotlib figure.

    The normalization with mean (0.485, 0.456, 0.406) and std (0.229, 0.224, 0.225) is undone
    in two steps (multiply by std, then add mean) before converting to a PIL image.
    :param img_tensor: image tensor to display
    """
    undo_std = transforms.Normalize(mean=[0, 0, 0, ], std=[1 / 0.229, 1 / 0.224, 1 / 0.225])
    undo_mean = transforms.Normalize(mean=[-0.485, -0.456, -0.406], std=[1, 1, 1])
    to_pil = transforms.Compose([undo_std, undo_mean, transforms.ToPILImage()])

    pil_image = to_pil(img_tensor)
    plt.figure()
    plt.imshow(pil_image)
    plt.show()
def make_train_tag(net_class: nn.Module,
                   face_policy: str,
                   patch_size: int,
                   traindb: List[str],
                   seed: int,
                   suffix: str,
                   debug: bool,
                   ):
    """
    Compose the tag string that identifies a training run, printing the parameters and the tag.

    The tag is '<key>-<value>' pairs joined by '_' in the order net, traindb, face, size, seed,
    prefixed by 'debug_' when debug is set and followed by '_<suffix>' when suffix is not None.
    :param net_class: network class; its __name__ goes into the tag
    :param face_policy: face extraction policy
    :param patch_size: size of the patch
    :param traindb: names of the training datasets, joined with '-'
    :param seed: random seed
    :param suffix: optional trailing label, or None
    :param debug: whether to mark the tag as a debug run
    :return: the tag string
    """
    # Training parameters and tag
    tag_params = {
        'net': net_class.__name__,
        'traindb': '-'.join(traindb),
        'face': face_policy,
        'size': patch_size,
        'seed': seed,
    }
    print('Parameters')
    pprint(tag_params)

    pieces = [name + '-' + str(value) for name, value in tag_params.items()]
    prefix = 'debug_' if debug else ''
    tag = prefix + '_'.join(pieces)
    if suffix is not None:
        tag = tag + '_' + suffix
    print('Tag: {:s}'.format(tag))
    return tag
def get_transformer(face_policy: str, patch_size: int, net_normalizer: transforms.Normalize, train: bool):
    """
    Assemble the albumentations pipeline applied to a face before it is fed to the network.

    The pipeline is the concatenation of: policy-dependent loading transformations, an optional
    0.5x downscale (training only), a block of augmentations (training only), then normalization
    and conversion to tensor.
    :param face_policy: 'scale' or 'tight'
    :param patch_size: side of the square patch produced by the loading transformations
    :param net_normalizer: object whose mean and std are used for the final normalization
    :param train: whether to include the training-only downscale and augmentations
    :return: the composed transformation
    :raises ValueError: if face_policy is neither 'scale' nor 'tight'
    """
    # Transformers and traindb
    if face_policy == 'scale':
        # The loader crops the face isotropically then scales to a square of size patch_size_load
        loading_transformations = [
            A.PadIfNeeded(min_height=patch_size, min_width=patch_size,
                          border_mode=cv2.BORDER_CONSTANT, value=0, always_apply=True),
            A.Resize(height=patch_size, width=patch_size, always_apply=True),
        ]
    elif face_policy == 'tight':
        # The loader crops the face tightly without any scaling
        loading_transformations = [
            A.LongestMaxSize(max_size=patch_size, always_apply=True),
            A.PadIfNeeded(min_height=patch_size, min_width=patch_size,
                          border_mode=cv2.BORDER_CONSTANT, value=0, always_apply=True),
        ]
    else:
        raise ValueError('Unknown value for face_policy: {}'.format(face_policy))

    # Training-only stages; both face policies use the same ones
    downsample_train_transformations = []
    aug_transformations = []
    if train:
        downsample_train_transformations.append(
            A.Downscale(scale_max=0.5, scale_min=0.5, p=0.5),  # replaces scaled dataset
        )
        color_jitter = A.OneOf([
            A.RandomBrightnessContrast(),
            A.HueSaturationValue(hue_shift_limit=10, sat_shift_limit=30, val_shift_limit=20),
        ])
        noise = A.OneOf([
            A.ISONoise(),
            A.IAAAdditiveGaussianNoise(scale=(0.01 * 255, 0.03 * 255)),
        ])
        aug_transformations.append(
            A.Compose([
                A.HorizontalFlip(),
                color_jitter,
                noise,
                A.Downscale(scale_min=0.7, scale_max=0.9, interpolation=cv2.INTER_LINEAR),
                A.ImageCompression(quality_lower=50, quality_upper=99),
            ], )
        )

    # Common final transformations
    final_transformations = [
        A.Normalize(mean=net_normalizer.mean, std=net_normalizer.std, ),
        ToTensorV2(),
    ]

    pipeline = (loading_transformations + downsample_train_transformations
                + aug_transformations + final_transformations)
    return A.Compose(pipeline)
def aggregate(x, deadzone: float, pre_mult: float, policy: str, post_mult: float, clipmargin: float, params=None):
    """
    Collapse a set of scores into a single value clipped to [clipmargin, 1 - clipmargin].

    :param x: array-like of scores; it is copied, the input is never modified
    :param deadzone: if > 0, scores with absolute value <= deadzone are discarded first
                     (if none is left, a single 0 score is used)
    :param pre_mult: multiplier applied to the scores before the sigmoid (or before the sign, for 'voting')
    :param policy: one of
                   'mean'      sigmoid of the mean
                   'sigmean'   mean of the sigmoids
                   'meanp'     sigmoid of the signed power mean with exponent params['p'] (default 3)
                   'median'    sigmoid of the median
                   'sigmedian' median of the sigmoids
                   'maxabs'    sigmoid of the score with the largest absolute value
                   'avgvoting' mean of the signs, mapped with (v * post_mult + 1) / 2
                   'voting'    sign of the mean
    :param post_mult: stretch applied around 0.5 to the aggregated value
    :param clipmargin: margin kept from 0 and 1 in the returned value
    :param params: optional dictionary of policy-specific parameters; only read, never modified
    :return: aggregated value
    :raises NotImplementedError: unknown policy
    """
    # Import the submodule explicitly: a bare `import scipy` does not guarantee scipy.special is loaded
    from scipy.special import expit

    options = {} if params is None else params
    x = np.array(x)  # always a fresh copy, also for plain sequences
    if deadzone > 0:
        x = x[(x > deadzone) | (x < -deadzone)]
        if len(x) == 0:
            x = np.asarray([0, ])

    if policy == 'avgvoting':
        # Mean sign lies in [-1, 1]; it has its own mapping to [0, 1]
        x = np.mean(np.sign(x))
        x = (x * post_mult + 1) / 2
        return np.clip(x, clipmargin, 1 - clipmargin)

    if policy == 'mean':
        x = expit(np.mean(x) * pre_mult)
    elif policy == 'sigmean':
        x = expit(x * pre_mult).mean()
    elif policy == 'meanp':
        # .get instead of .pop: the caller's dictionary must survive repeated calls
        pow_coeff = options.get('p', 3)
        x = np.mean(np.sign(x) * (np.abs(x) ** pow_coeff))
        x = np.sign(x) * (np.abs(x) ** (1 / pow_coeff))
        x = expit(x * pre_mult)
    elif policy == 'median':
        x = expit(np.median(x) * pre_mult)
    elif policy == 'sigmedian':
        x = np.median(expit(x * pre_mult))
    elif policy == 'maxabs':
        x = np.min(x) if abs(np.min(x)) > abs(np.max(x)) else np.max(x)
        x = expit(x * pre_mult)
    elif policy == 'voting':
        # NOTE(review): the sign is in {-1, 0, 1} yet goes through the same stretch as the
        # sigmoid outputs below; kept as in the original, confirm it is intended.
        x = np.sign(np.mean(x * pre_mult))
    else:
        raise NotImplementedError()

    # Stretch around 0.5, shared by every policy except 'avgvoting'
    x = (x - 0.5) * post_mult + 0.5
    return np.clip(x, clipmargin, 1 - clipmargin)

models/icpr2020dfdc/notebook/Analyze results net fusion paper.ipynb ADDED Viewed