DimasMP3 commited on
Commit
5c69097
·
verified ·
1 Parent(s): deba98e

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +7 -0
  2. facelandmarker/face_landmarker.task +3 -0
  3. insightface/.gitattributes +30 -0
  4. insightface/README.md +22 -0
  5. insightface/models/arcface_r100_v1/model-0000.params +3 -0
  6. insightface/models/arcface_r100_v1/model-symbol.json +0 -0
  7. insightface/models/buffalo_l/1k3d68.onnx +3 -0
  8. insightface/models/buffalo_l/2d106det.onnx +3 -0
  9. insightface/models/buffalo_l/det_10g.onnx +3 -0
  10. insightface/models/buffalo_l/genderage.onnx +3 -0
  11. insightface/models/buffalo_l/w600k_r50.onnx +3 -0
  12. insightface/models/genderage_v1/model-0000.params +3 -0
  13. insightface/models/genderage_v1/model-symbol.json +2399 -0
  14. insightface/models/retinaface_r50_v1/R50-0000.params +3 -0
  15. insightface/models/retinaface_r50_v1/R50-symbol.json +0 -0
  16. insightface/models/scrfd_10g/model.pth +3 -0
  17. insightface/models/scrfd_10g_bnkps/model.pth +3 -0
  18. insightface/models/scrfd_1g/model.pth +3 -0
  19. insightface/models/scrfd_2.5g/model.pth +3 -0
  20. insightface/models/scrfd_2.5g_bnkps/model.pth +3 -0
  21. insightface/models/scrfd_34g/model.pth +3 -0
  22. insightface/models/scrfd_500m/model.pth +3 -0
  23. insightface/models/scrfd_500m_bnkps/model.pth +3 -0
  24. insightface/models/scrfd_person_2.5g.onnx +3 -0
  25. insightface/models/synthetic_resnet50d.ckpt +3 -0
  26. talknet-asd/.dockerignore +20 -0
  27. talknet-asd/.gitignore +118 -0
  28. talknet-asd/FAQ.md +54 -0
  29. talknet-asd/LICENSE.md +21 -0
  30. talknet-asd/README.md +146 -0
  31. talknet-asd/TalkSet/README.md +48 -0
  32. talknet-asd/TalkSet/generate_TalkSet.py +391 -0
  33. talknet-asd/awesomeASD.md +38 -0
  34. talknet-asd/cog.yaml +40 -0
  35. talknet-asd/dataLoader.py +143 -0
  36. talknet-asd/demoTalkNet.py +686 -0
  37. talknet-asd/export_onnx_cpu.py +87 -0
  38. talknet-asd/loss.py +50 -0
  39. talknet-asd/model/attentionLayer.py +36 -0
  40. talknet-asd/model/audioEncoder.py +108 -0
  41. talknet-asd/model/faceDetector/README.md +3 -0
  42. talknet-asd/model/faceDetector/__init__.py +1 -0
  43. talknet-asd/model/faceDetector/s3fd/__init__.py +66 -0
  44. talknet-asd/model/faceDetector/s3fd/box_utils.py +217 -0
  45. talknet-asd/model/faceDetector/s3fd/nets.py +174 -0
  46. talknet-asd/model/talkNetModel.py +64 -0
  47. talknet-asd/model/visualEncoder.py +172 -0
  48. talknet-asd/predict.py +201 -0
  49. talknet-asd/sanity_check.ipynb +0 -0
  50. talknet-asd/talkNet.py +94 -0
.gitattributes CHANGED
@@ -33,3 +33,10 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ facelandmarker/face_landmarker.task filter=lfs diff=lfs merge=lfs -text
37
+ insightface/models/arcface_r100_v1/model-0000.params filter=lfs diff=lfs merge=lfs -text
38
+ insightface/models/genderage_v1/model-0000.params filter=lfs diff=lfs merge=lfs -text
39
+ insightface/models/retinaface_r50_v1/R50-0000.params filter=lfs diff=lfs merge=lfs -text
40
+ talknet-asd/utils/overall.png filter=lfs diff=lfs merge=lfs -text
41
+ yolo-face-person-detector/images/image.png filter=lfs diff=lfs merge=lfs -text
42
+ yolo-face-person-detector/images/output.mp4 filter=lfs diff=lfs merge=lfs -text
facelandmarker/face_landmarker.task ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:64184e229b263107bc2b804c6625db1341ff2bb731874b0bcc2fe6544e0bc9ff
3
+ size 3758596
insightface/.gitattributes ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.params filter=lfs diff=lfs merge=lfs -text
2
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
3
+ *.7z filter=lfs diff=lfs merge=lfs -text
4
+ *.arrow filter=lfs diff=lfs merge=lfs -text
5
+ *.bin filter=lfs diff=lfs merge=lfs -text
6
+ *.bin.* filter=lfs diff=lfs merge=lfs -text
7
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
8
+ *.ftz filter=lfs diff=lfs merge=lfs -text
9
+ *.gz filter=lfs diff=lfs merge=lfs -text
10
+ *.h5 filter=lfs diff=lfs merge=lfs -text
11
+ *.joblib filter=lfs diff=lfs merge=lfs -text
12
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
13
+ *.model filter=lfs diff=lfs merge=lfs -text
14
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
15
+ *.onnx filter=lfs diff=lfs merge=lfs -text
16
+ *.ot filter=lfs diff=lfs merge=lfs -text
17
+ *.parquet filter=lfs diff=lfs merge=lfs -text
18
+ *.pb filter=lfs diff=lfs merge=lfs -text
19
+ *.pt filter=lfs diff=lfs merge=lfs -text
20
+ *.pth filter=lfs diff=lfs merge=lfs -text
21
+ *.rar filter=lfs diff=lfs merge=lfs -text
22
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
23
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
24
+ *.tflite filter=lfs diff=lfs merge=lfs -text
25
+ *.tgz filter=lfs diff=lfs merge=lfs -text
26
+ *.wasm filter=lfs diff=lfs merge=lfs -text
27
+ *.xz filter=lfs diff=lfs merge=lfs -text
28
+ *.zip filter=lfs diff=lfs merge=lfs -text
29
+ *.zstandard filter=lfs diff=lfs merge=lfs -text
30
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
insightface/README.md ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # insightface
2
+
3
+ - https://github.com/deepinsight/insightface
4
+ - SCRFD
5
+ - https://github.com/deepinsight/insightface/tree/master/detection/scrfd
6
+ - https://1drv.ms/u/s!AswpsDO2toNKqyYWxScdiTITY4TQ?e=DjXof9
7
+ - https://1drv.ms/u/s!AswpsDO2toNKqyPVLI44ahNBsOMR?e=esPrBL
8
+ - https://1drv.ms/u/s!AswpsDO2toNKqyTIXnzB1ujPq4th?e=5t1VNv
9
+ - https://1drv.ms/u/s!AswpsDO2toNKqyUKwTiwXv2kaa8o?e=umfepO
10
+ - https://1drv.ms/u/s!AswpsDO2toNKqyKZwFebVlmlOvzz?e=V2rqUy
11
+ - https://1drv.ms/u/s!AswpsDO2toNKri_NDM0GIkPpkE2f?e=JkebJo
12
+ - https://1drv.ms/u/s!AswpsDO2toNKqyGlhxnCg3smyQqX?e=A6Hufm
13
+ - https://1drv.ms/u/s!AswpsDO2toNKqyGlhxnCg3smyQqX?e=A6Hufm
14
+ - Person Detection
15
+ - https://github.com/deepinsight/insightface/tree/master/examples/person_detection
16
+ - https://github.com/deepinsight/insightface/releases/download/v0.7/scrfd_person_2.5g.onnx
17
+ - Face Alignment (FaceSynthetics)
18
+ - https://github.com/deepinsight/insightface/tree/master/alignment/synthetics
19
+ - https://drive.google.com/file/d/1kNP7qEl3AYNbaHFUg_ZiyRB1CtfDWXR4/view?usp=sharing
20
+ - buffalo_l
21
+ - https://github.com/deepinsight/insightface/releases/download/v0.7/buffalo_l.zip
22
+
insightface/models/arcface_r100_v1/model-0000.params ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:931257c0b7174254fd81314706f2591cc6d1dd7299275bb8cf01c774ed0da8be
3
+ size 260958682
insightface/models/arcface_r100_v1/model-symbol.json ADDED
The diff for this file is too large to render. See raw diff
 
insightface/models/buffalo_l/1k3d68.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df5c06b8a0c12e422b2ed8947b8869faa4105387f199c477af038aa01f9a45cc
3
+ size 143607619
insightface/models/buffalo_l/2d106det.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f001b856447c413801ef5c42091ed0cd516fcd21f2d6b79635b1e733a7109dbf
3
+ size 5030888
insightface/models/buffalo_l/det_10g.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5838f7fe053675b1c7a08b633df49e7af5495cee0493c7dcf6697200b85b5b91
3
+ size 16923827
insightface/models/buffalo_l/genderage.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4fde69b1c810857b88c64a335084f1c3fe8f01246c9a191b48c7bb756d6652fb
3
+ size 1322532
insightface/models/buffalo_l/w600k_r50.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4c06341c33c2ca1f86781dab0e829f88ad5b64be9fba56e56bc9ebdefc619e43
3
+ size 174383860
insightface/models/genderage_v1/model-0000.params ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:01788b7eaa2516636cbd976fad7883164aaeb0bd4027e878ff457f79fe9021aa
3
+ size 1100856
insightface/models/genderage_v1/model-symbol.json ADDED
@@ -0,0 +1,2399 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "nodes": [
3
+ {
4
+ "op": "null",
5
+ "name": "data",
6
+ "inputs": []
7
+ },
8
+ {
9
+ "op": "_minus_scalar",
10
+ "name": "_minusscalar0",
11
+ "attrs": {"scalar": "127.5"},
12
+ "inputs": [[0, 0, 0]]
13
+ },
14
+ {
15
+ "op": "_mul_scalar",
16
+ "name": "_mulscalar0",
17
+ "attrs": {"scalar": "0.0078125"},
18
+ "inputs": [[1, 0, 0]]
19
+ },
20
+ {
21
+ "op": "null",
22
+ "name": "conv_1_conv2d_weight",
23
+ "attrs": {
24
+ "kernel": "(3, 3)",
25
+ "no_bias": "True",
26
+ "num_filter": "8",
27
+ "num_group": "1",
28
+ "pad": "(1, 1)",
29
+ "stride": "(1, 1)"
30
+ },
31
+ "inputs": []
32
+ },
33
+ {
34
+ "op": "Convolution",
35
+ "name": "conv_1_conv2d",
36
+ "attrs": {
37
+ "kernel": "(3, 3)",
38
+ "no_bias": "True",
39
+ "num_filter": "8",
40
+ "num_group": "1",
41
+ "pad": "(1, 1)",
42
+ "stride": "(1, 1)"
43
+ },
44
+ "inputs": [[2, 0, 0], [3, 0, 0]]
45
+ },
46
+ {
47
+ "op": "null",
48
+ "name": "conv_1_batchnorm_gamma",
49
+ "attrs": {"fix_gamma": "True"},
50
+ "inputs": []
51
+ },
52
+ {
53
+ "op": "null",
54
+ "name": "conv_1_batchnorm_beta",
55
+ "attrs": {"fix_gamma": "True"},
56
+ "inputs": []
57
+ },
58
+ {
59
+ "op": "null",
60
+ "name": "conv_1_batchnorm_moving_mean",
61
+ "attrs": {
62
+ "__init__": "[\"zero\", {}]",
63
+ "fix_gamma": "True"
64
+ },
65
+ "inputs": []
66
+ },
67
+ {
68
+ "op": "null",
69
+ "name": "conv_1_batchnorm_moving_var",
70
+ "attrs": {
71
+ "__init__": "[\"one\", {}]",
72
+ "fix_gamma": "True"
73
+ },
74
+ "inputs": []
75
+ },
76
+ {
77
+ "op": "BatchNorm",
78
+ "name": "conv_1_batchnorm",
79
+ "attrs": {"fix_gamma": "True"},
80
+ "inputs": [[4, 0, 0], [5, 0, 0], [6, 0, 0], [7, 0, 1], [8, 0, 1]]
81
+ },
82
+ {
83
+ "op": "Activation",
84
+ "name": "conv_1_relu",
85
+ "attrs": {"act_type": "relu"},
86
+ "inputs": [[9, 0, 0]]
87
+ },
88
+ {
89
+ "op": "null",
90
+ "name": "conv_2_dw_conv2d_weight",
91
+ "attrs": {
92
+ "kernel": "(3, 3)",
93
+ "no_bias": "True",
94
+ "num_filter": "8",
95
+ "num_group": "8",
96
+ "pad": "(1, 1)",
97
+ "stride": "(1, 1)"
98
+ },
99
+ "inputs": []
100
+ },
101
+ {
102
+ "op": "Convolution",
103
+ "name": "conv_2_dw_conv2d",
104
+ "attrs": {
105
+ "kernel": "(3, 3)",
106
+ "no_bias": "True",
107
+ "num_filter": "8",
108
+ "num_group": "8",
109
+ "pad": "(1, 1)",
110
+ "stride": "(1, 1)"
111
+ },
112
+ "inputs": [[10, 0, 0], [11, 0, 0]]
113
+ },
114
+ {
115
+ "op": "null",
116
+ "name": "conv_2_dw_batchnorm_gamma",
117
+ "attrs": {"fix_gamma": "True"},
118
+ "inputs": []
119
+ },
120
+ {
121
+ "op": "null",
122
+ "name": "conv_2_dw_batchnorm_beta",
123
+ "attrs": {"fix_gamma": "True"},
124
+ "inputs": []
125
+ },
126
+ {
127
+ "op": "null",
128
+ "name": "conv_2_dw_batchnorm_moving_mean",
129
+ "attrs": {
130
+ "__init__": "[\"zero\", {}]",
131
+ "fix_gamma": "True"
132
+ },
133
+ "inputs": []
134
+ },
135
+ {
136
+ "op": "null",
137
+ "name": "conv_2_dw_batchnorm_moving_var",
138
+ "attrs": {
139
+ "__init__": "[\"one\", {}]",
140
+ "fix_gamma": "True"
141
+ },
142
+ "inputs": []
143
+ },
144
+ {
145
+ "op": "BatchNorm",
146
+ "name": "conv_2_dw_batchnorm",
147
+ "attrs": {"fix_gamma": "True"},
148
+ "inputs": [[12, 0, 0], [13, 0, 0], [14, 0, 0], [15, 0, 1], [16, 0, 1]]
149
+ },
150
+ {
151
+ "op": "Activation",
152
+ "name": "conv_2_dw_relu",
153
+ "attrs": {"act_type": "relu"},
154
+ "inputs": [[17, 0, 0]]
155
+ },
156
+ {
157
+ "op": "null",
158
+ "name": "conv_2_conv2d_weight",
159
+ "attrs": {
160
+ "kernel": "(1, 1)",
161
+ "no_bias": "True",
162
+ "num_filter": "16",
163
+ "num_group": "1",
164
+ "pad": "(0, 0)",
165
+ "stride": "(1, 1)"
166
+ },
167
+ "inputs": []
168
+ },
169
+ {
170
+ "op": "Convolution",
171
+ "name": "conv_2_conv2d",
172
+ "attrs": {
173
+ "kernel": "(1, 1)",
174
+ "no_bias": "True",
175
+ "num_filter": "16",
176
+ "num_group": "1",
177
+ "pad": "(0, 0)",
178
+ "stride": "(1, 1)"
179
+ },
180
+ "inputs": [[18, 0, 0], [19, 0, 0]]
181
+ },
182
+ {
183
+ "op": "null",
184
+ "name": "conv_2_batchnorm_gamma",
185
+ "attrs": {"fix_gamma": "True"},
186
+ "inputs": []
187
+ },
188
+ {
189
+ "op": "null",
190
+ "name": "conv_2_batchnorm_beta",
191
+ "attrs": {"fix_gamma": "True"},
192
+ "inputs": []
193
+ },
194
+ {
195
+ "op": "null",
196
+ "name": "conv_2_batchnorm_moving_mean",
197
+ "attrs": {
198
+ "__init__": "[\"zero\", {}]",
199
+ "fix_gamma": "True"
200
+ },
201
+ "inputs": []
202
+ },
203
+ {
204
+ "op": "null",
205
+ "name": "conv_2_batchnorm_moving_var",
206
+ "attrs": {
207
+ "__init__": "[\"one\", {}]",
208
+ "fix_gamma": "True"
209
+ },
210
+ "inputs": []
211
+ },
212
+ {
213
+ "op": "BatchNorm",
214
+ "name": "conv_2_batchnorm",
215
+ "attrs": {"fix_gamma": "True"},
216
+ "inputs": [[20, 0, 0], [21, 0, 0], [22, 0, 0], [23, 0, 1], [24, 0, 1]]
217
+ },
218
+ {
219
+ "op": "Activation",
220
+ "name": "conv_2_relu",
221
+ "attrs": {"act_type": "relu"},
222
+ "inputs": [[25, 0, 0]]
223
+ },
224
+ {
225
+ "op": "null",
226
+ "name": "conv_3_dw_conv2d_weight",
227
+ "attrs": {
228
+ "kernel": "(3, 3)",
229
+ "no_bias": "True",
230
+ "num_filter": "16",
231
+ "num_group": "16",
232
+ "pad": "(1, 1)",
233
+ "stride": "(2, 2)"
234
+ },
235
+ "inputs": []
236
+ },
237
+ {
238
+ "op": "Convolution",
239
+ "name": "conv_3_dw_conv2d",
240
+ "attrs": {
241
+ "kernel": "(3, 3)",
242
+ "no_bias": "True",
243
+ "num_filter": "16",
244
+ "num_group": "16",
245
+ "pad": "(1, 1)",
246
+ "stride": "(2, 2)"
247
+ },
248
+ "inputs": [[26, 0, 0], [27, 0, 0]]
249
+ },
250
+ {
251
+ "op": "null",
252
+ "name": "conv_3_dw_batchnorm_gamma",
253
+ "attrs": {"fix_gamma": "True"},
254
+ "inputs": []
255
+ },
256
+ {
257
+ "op": "null",
258
+ "name": "conv_3_dw_batchnorm_beta",
259
+ "attrs": {"fix_gamma": "True"},
260
+ "inputs": []
261
+ },
262
+ {
263
+ "op": "null",
264
+ "name": "conv_3_dw_batchnorm_moving_mean",
265
+ "attrs": {
266
+ "__init__": "[\"zero\", {}]",
267
+ "fix_gamma": "True"
268
+ },
269
+ "inputs": []
270
+ },
271
+ {
272
+ "op": "null",
273
+ "name": "conv_3_dw_batchnorm_moving_var",
274
+ "attrs": {
275
+ "__init__": "[\"one\", {}]",
276
+ "fix_gamma": "True"
277
+ },
278
+ "inputs": []
279
+ },
280
+ {
281
+ "op": "BatchNorm",
282
+ "name": "conv_3_dw_batchnorm",
283
+ "attrs": {"fix_gamma": "True"},
284
+ "inputs": [[28, 0, 0], [29, 0, 0], [30, 0, 0], [31, 0, 1], [32, 0, 1]]
285
+ },
286
+ {
287
+ "op": "Activation",
288
+ "name": "conv_3_dw_relu",
289
+ "attrs": {"act_type": "relu"},
290
+ "inputs": [[33, 0, 0]]
291
+ },
292
+ {
293
+ "op": "null",
294
+ "name": "conv_3_conv2d_weight",
295
+ "attrs": {
296
+ "kernel": "(1, 1)",
297
+ "no_bias": "True",
298
+ "num_filter": "32",
299
+ "num_group": "1",
300
+ "pad": "(0, 0)",
301
+ "stride": "(1, 1)"
302
+ },
303
+ "inputs": []
304
+ },
305
+ {
306
+ "op": "Convolution",
307
+ "name": "conv_3_conv2d",
308
+ "attrs": {
309
+ "kernel": "(1, 1)",
310
+ "no_bias": "True",
311
+ "num_filter": "32",
312
+ "num_group": "1",
313
+ "pad": "(0, 0)",
314
+ "stride": "(1, 1)"
315
+ },
316
+ "inputs": [[34, 0, 0], [35, 0, 0]]
317
+ },
318
+ {
319
+ "op": "null",
320
+ "name": "conv_3_batchnorm_gamma",
321
+ "attrs": {"fix_gamma": "True"},
322
+ "inputs": []
323
+ },
324
+ {
325
+ "op": "null",
326
+ "name": "conv_3_batchnorm_beta",
327
+ "attrs": {"fix_gamma": "True"},
328
+ "inputs": []
329
+ },
330
+ {
331
+ "op": "null",
332
+ "name": "conv_3_batchnorm_moving_mean",
333
+ "attrs": {
334
+ "__init__": "[\"zero\", {}]",
335
+ "fix_gamma": "True"
336
+ },
337
+ "inputs": []
338
+ },
339
+ {
340
+ "op": "null",
341
+ "name": "conv_3_batchnorm_moving_var",
342
+ "attrs": {
343
+ "__init__": "[\"one\", {}]",
344
+ "fix_gamma": "True"
345
+ },
346
+ "inputs": []
347
+ },
348
+ {
349
+ "op": "BatchNorm",
350
+ "name": "conv_3_batchnorm",
351
+ "attrs": {"fix_gamma": "True"},
352
+ "inputs": [[36, 0, 0], [37, 0, 0], [38, 0, 0], [39, 0, 1], [40, 0, 1]]
353
+ },
354
+ {
355
+ "op": "Activation",
356
+ "name": "conv_3_relu",
357
+ "attrs": {"act_type": "relu"},
358
+ "inputs": [[41, 0, 0]]
359
+ },
360
+ {
361
+ "op": "null",
362
+ "name": "conv_4_dw_conv2d_weight",
363
+ "attrs": {
364
+ "kernel": "(3, 3)",
365
+ "no_bias": "True",
366
+ "num_filter": "32",
367
+ "num_group": "32",
368
+ "pad": "(1, 1)",
369
+ "stride": "(1, 1)"
370
+ },
371
+ "inputs": []
372
+ },
373
+ {
374
+ "op": "Convolution",
375
+ "name": "conv_4_dw_conv2d",
376
+ "attrs": {
377
+ "kernel": "(3, 3)",
378
+ "no_bias": "True",
379
+ "num_filter": "32",
380
+ "num_group": "32",
381
+ "pad": "(1, 1)",
382
+ "stride": "(1, 1)"
383
+ },
384
+ "inputs": [[42, 0, 0], [43, 0, 0]]
385
+ },
386
+ {
387
+ "op": "null",
388
+ "name": "conv_4_dw_batchnorm_gamma",
389
+ "attrs": {"fix_gamma": "True"},
390
+ "inputs": []
391
+ },
392
+ {
393
+ "op": "null",
394
+ "name": "conv_4_dw_batchnorm_beta",
395
+ "attrs": {"fix_gamma": "True"},
396
+ "inputs": []
397
+ },
398
+ {
399
+ "op": "null",
400
+ "name": "conv_4_dw_batchnorm_moving_mean",
401
+ "attrs": {
402
+ "__init__": "[\"zero\", {}]",
403
+ "fix_gamma": "True"
404
+ },
405
+ "inputs": []
406
+ },
407
+ {
408
+ "op": "null",
409
+ "name": "conv_4_dw_batchnorm_moving_var",
410
+ "attrs": {
411
+ "__init__": "[\"one\", {}]",
412
+ "fix_gamma": "True"
413
+ },
414
+ "inputs": []
415
+ },
416
+ {
417
+ "op": "BatchNorm",
418
+ "name": "conv_4_dw_batchnorm",
419
+ "attrs": {"fix_gamma": "True"},
420
+ "inputs": [[44, 0, 0], [45, 0, 0], [46, 0, 0], [47, 0, 1], [48, 0, 1]]
421
+ },
422
+ {
423
+ "op": "Activation",
424
+ "name": "conv_4_dw_relu",
425
+ "attrs": {"act_type": "relu"},
426
+ "inputs": [[49, 0, 0]]
427
+ },
428
+ {
429
+ "op": "null",
430
+ "name": "conv_4_conv2d_weight",
431
+ "attrs": {
432
+ "kernel": "(1, 1)",
433
+ "no_bias": "True",
434
+ "num_filter": "32",
435
+ "num_group": "1",
436
+ "pad": "(0, 0)",
437
+ "stride": "(1, 1)"
438
+ },
439
+ "inputs": []
440
+ },
441
+ {
442
+ "op": "Convolution",
443
+ "name": "conv_4_conv2d",
444
+ "attrs": {
445
+ "kernel": "(1, 1)",
446
+ "no_bias": "True",
447
+ "num_filter": "32",
448
+ "num_group": "1",
449
+ "pad": "(0, 0)",
450
+ "stride": "(1, 1)"
451
+ },
452
+ "inputs": [[50, 0, 0], [51, 0, 0]]
453
+ },
454
+ {
455
+ "op": "null",
456
+ "name": "conv_4_batchnorm_gamma",
457
+ "attrs": {"fix_gamma": "True"},
458
+ "inputs": []
459
+ },
460
+ {
461
+ "op": "null",
462
+ "name": "conv_4_batchnorm_beta",
463
+ "attrs": {"fix_gamma": "True"},
464
+ "inputs": []
465
+ },
466
+ {
467
+ "op": "null",
468
+ "name": "conv_4_batchnorm_moving_mean",
469
+ "attrs": {
470
+ "__init__": "[\"zero\", {}]",
471
+ "fix_gamma": "True"
472
+ },
473
+ "inputs": []
474
+ },
475
+ {
476
+ "op": "null",
477
+ "name": "conv_4_batchnorm_moving_var",
478
+ "attrs": {
479
+ "__init__": "[\"one\", {}]",
480
+ "fix_gamma": "True"
481
+ },
482
+ "inputs": []
483
+ },
484
+ {
485
+ "op": "BatchNorm",
486
+ "name": "conv_4_batchnorm",
487
+ "attrs": {"fix_gamma": "True"},
488
+ "inputs": [[52, 0, 0], [53, 0, 0], [54, 0, 0], [55, 0, 1], [56, 0, 1]]
489
+ },
490
+ {
491
+ "op": "Activation",
492
+ "name": "conv_4_relu",
493
+ "attrs": {"act_type": "relu"},
494
+ "inputs": [[57, 0, 0]]
495
+ },
496
+ {
497
+ "op": "null",
498
+ "name": "conv_5_dw_conv2d_weight",
499
+ "attrs": {
500
+ "kernel": "(3, 3)",
501
+ "no_bias": "True",
502
+ "num_filter": "32",
503
+ "num_group": "32",
504
+ "pad": "(1, 1)",
505
+ "stride": "(2, 2)"
506
+ },
507
+ "inputs": []
508
+ },
509
+ {
510
+ "op": "Convolution",
511
+ "name": "conv_5_dw_conv2d",
512
+ "attrs": {
513
+ "kernel": "(3, 3)",
514
+ "no_bias": "True",
515
+ "num_filter": "32",
516
+ "num_group": "32",
517
+ "pad": "(1, 1)",
518
+ "stride": "(2, 2)"
519
+ },
520
+ "inputs": [[58, 0, 0], [59, 0, 0]]
521
+ },
522
+ {
523
+ "op": "null",
524
+ "name": "conv_5_dw_batchnorm_gamma",
525
+ "attrs": {"fix_gamma": "True"},
526
+ "inputs": []
527
+ },
528
+ {
529
+ "op": "null",
530
+ "name": "conv_5_dw_batchnorm_beta",
531
+ "attrs": {"fix_gamma": "True"},
532
+ "inputs": []
533
+ },
534
+ {
535
+ "op": "null",
536
+ "name": "conv_5_dw_batchnorm_moving_mean",
537
+ "attrs": {
538
+ "__init__": "[\"zero\", {}]",
539
+ "fix_gamma": "True"
540
+ },
541
+ "inputs": []
542
+ },
543
+ {
544
+ "op": "null",
545
+ "name": "conv_5_dw_batchnorm_moving_var",
546
+ "attrs": {
547
+ "__init__": "[\"one\", {}]",
548
+ "fix_gamma": "True"
549
+ },
550
+ "inputs": []
551
+ },
552
+ {
553
+ "op": "BatchNorm",
554
+ "name": "conv_5_dw_batchnorm",
555
+ "attrs": {"fix_gamma": "True"},
556
+ "inputs": [[60, 0, 0], [61, 0, 0], [62, 0, 0], [63, 0, 1], [64, 0, 1]]
557
+ },
558
+ {
559
+ "op": "Activation",
560
+ "name": "conv_5_dw_relu",
561
+ "attrs": {"act_type": "relu"},
562
+ "inputs": [[65, 0, 0]]
563
+ },
564
+ {
565
+ "op": "null",
566
+ "name": "conv_5_conv2d_weight",
567
+ "attrs": {
568
+ "kernel": "(1, 1)",
569
+ "no_bias": "True",
570
+ "num_filter": "64",
571
+ "num_group": "1",
572
+ "pad": "(0, 0)",
573
+ "stride": "(1, 1)"
574
+ },
575
+ "inputs": []
576
+ },
577
+ {
578
+ "op": "Convolution",
579
+ "name": "conv_5_conv2d",
580
+ "attrs": {
581
+ "kernel": "(1, 1)",
582
+ "no_bias": "True",
583
+ "num_filter": "64",
584
+ "num_group": "1",
585
+ "pad": "(0, 0)",
586
+ "stride": "(1, 1)"
587
+ },
588
+ "inputs": [[66, 0, 0], [67, 0, 0]]
589
+ },
590
+ {
591
+ "op": "null",
592
+ "name": "conv_5_batchnorm_gamma",
593
+ "attrs": {"fix_gamma": "True"},
594
+ "inputs": []
595
+ },
596
+ {
597
+ "op": "null",
598
+ "name": "conv_5_batchnorm_beta",
599
+ "attrs": {"fix_gamma": "True"},
600
+ "inputs": []
601
+ },
602
+ {
603
+ "op": "null",
604
+ "name": "conv_5_batchnorm_moving_mean",
605
+ "attrs": {
606
+ "__init__": "[\"zero\", {}]",
607
+ "fix_gamma": "True"
608
+ },
609
+ "inputs": []
610
+ },
611
+ {
612
+ "op": "null",
613
+ "name": "conv_5_batchnorm_moving_var",
614
+ "attrs": {
615
+ "__init__": "[\"one\", {}]",
616
+ "fix_gamma": "True"
617
+ },
618
+ "inputs": []
619
+ },
620
+ {
621
+ "op": "BatchNorm",
622
+ "name": "conv_5_batchnorm",
623
+ "attrs": {"fix_gamma": "True"},
624
+ "inputs": [[68, 0, 0], [69, 0, 0], [70, 0, 0], [71, 0, 1], [72, 0, 1]]
625
+ },
626
+ {
627
+ "op": "Activation",
628
+ "name": "conv_5_relu",
629
+ "attrs": {"act_type": "relu"},
630
+ "inputs": [[73, 0, 0]]
631
+ },
632
+ {
633
+ "op": "null",
634
+ "name": "conv_6_dw_conv2d_weight",
635
+ "attrs": {
636
+ "kernel": "(3, 3)",
637
+ "no_bias": "True",
638
+ "num_filter": "64",
639
+ "num_group": "64",
640
+ "pad": "(1, 1)",
641
+ "stride": "(1, 1)"
642
+ },
643
+ "inputs": []
644
+ },
645
+ {
646
+ "op": "Convolution",
647
+ "name": "conv_6_dw_conv2d",
648
+ "attrs": {
649
+ "kernel": "(3, 3)",
650
+ "no_bias": "True",
651
+ "num_filter": "64",
652
+ "num_group": "64",
653
+ "pad": "(1, 1)",
654
+ "stride": "(1, 1)"
655
+ },
656
+ "inputs": [[74, 0, 0], [75, 0, 0]]
657
+ },
658
+ {
659
+ "op": "null",
660
+ "name": "conv_6_dw_batchnorm_gamma",
661
+ "attrs": {"fix_gamma": "True"},
662
+ "inputs": []
663
+ },
664
+ {
665
+ "op": "null",
666
+ "name": "conv_6_dw_batchnorm_beta",
667
+ "attrs": {"fix_gamma": "True"},
668
+ "inputs": []
669
+ },
670
+ {
671
+ "op": "null",
672
+ "name": "conv_6_dw_batchnorm_moving_mean",
673
+ "attrs": {
674
+ "__init__": "[\"zero\", {}]",
675
+ "fix_gamma": "True"
676
+ },
677
+ "inputs": []
678
+ },
679
+ {
680
+ "op": "null",
681
+ "name": "conv_6_dw_batchnorm_moving_var",
682
+ "attrs": {
683
+ "__init__": "[\"one\", {}]",
684
+ "fix_gamma": "True"
685
+ },
686
+ "inputs": []
687
+ },
688
+ {
689
+ "op": "BatchNorm",
690
+ "name": "conv_6_dw_batchnorm",
691
+ "attrs": {"fix_gamma": "True"},
692
+ "inputs": [[76, 0, 0], [77, 0, 0], [78, 0, 0], [79, 0, 1], [80, 0, 1]]
693
+ },
694
+ {
695
+ "op": "Activation",
696
+ "name": "conv_6_dw_relu",
697
+ "attrs": {"act_type": "relu"},
698
+ "inputs": [[81, 0, 0]]
699
+ },
700
+ {
701
+ "op": "null",
702
+ "name": "conv_6_conv2d_weight",
703
+ "attrs": {
704
+ "kernel": "(1, 1)",
705
+ "no_bias": "True",
706
+ "num_filter": "64",
707
+ "num_group": "1",
708
+ "pad": "(0, 0)",
709
+ "stride": "(1, 1)"
710
+ },
711
+ "inputs": []
712
+ },
713
+ {
714
+ "op": "Convolution",
715
+ "name": "conv_6_conv2d",
716
+ "attrs": {
717
+ "kernel": "(1, 1)",
718
+ "no_bias": "True",
719
+ "num_filter": "64",
720
+ "num_group": "1",
721
+ "pad": "(0, 0)",
722
+ "stride": "(1, 1)"
723
+ },
724
+ "inputs": [[82, 0, 0], [83, 0, 0]]
725
+ },
726
+ {
727
+ "op": "null",
728
+ "name": "conv_6_batchnorm_gamma",
729
+ "attrs": {"fix_gamma": "True"},
730
+ "inputs": []
731
+ },
732
+ {
733
+ "op": "null",
734
+ "name": "conv_6_batchnorm_beta",
735
+ "attrs": {"fix_gamma": "True"},
736
+ "inputs": []
737
+ },
738
+ {
739
+ "op": "null",
740
+ "name": "conv_6_batchnorm_moving_mean",
741
+ "attrs": {
742
+ "__init__": "[\"zero\", {}]",
743
+ "fix_gamma": "True"
744
+ },
745
+ "inputs": []
746
+ },
747
+ {
748
+ "op": "null",
749
+ "name": "conv_6_batchnorm_moving_var",
750
+ "attrs": {
751
+ "__init__": "[\"one\", {}]",
752
+ "fix_gamma": "True"
753
+ },
754
+ "inputs": []
755
+ },
756
+ {
757
+ "op": "BatchNorm",
758
+ "name": "conv_6_batchnorm",
759
+ "attrs": {"fix_gamma": "True"},
760
+ "inputs": [[84, 0, 0], [85, 0, 0], [86, 0, 0], [87, 0, 1], [88, 0, 1]]
761
+ },
762
+ {
763
+ "op": "Activation",
764
+ "name": "conv_6_relu",
765
+ "attrs": {"act_type": "relu"},
766
+ "inputs": [[89, 0, 0]]
767
+ },
768
+ {
769
+ "op": "null",
770
+ "name": "conv_7_dw_conv2d_weight",
771
+ "attrs": {
772
+ "kernel": "(3, 3)",
773
+ "no_bias": "True",
774
+ "num_filter": "64",
775
+ "num_group": "64",
776
+ "pad": "(1, 1)",
777
+ "stride": "(2, 2)"
778
+ },
779
+ "inputs": []
780
+ },
781
+ {
782
+ "op": "Convolution",
783
+ "name": "conv_7_dw_conv2d",
784
+ "attrs": {
785
+ "kernel": "(3, 3)",
786
+ "no_bias": "True",
787
+ "num_filter": "64",
788
+ "num_group": "64",
789
+ "pad": "(1, 1)",
790
+ "stride": "(2, 2)"
791
+ },
792
+ "inputs": [[90, 0, 0], [91, 0, 0]]
793
+ },
794
+ {
795
+ "op": "null",
796
+ "name": "conv_7_dw_batchnorm_gamma",
797
+ "attrs": {"fix_gamma": "True"},
798
+ "inputs": []
799
+ },
800
+ {
801
+ "op": "null",
802
+ "name": "conv_7_dw_batchnorm_beta",
803
+ "attrs": {"fix_gamma": "True"},
804
+ "inputs": []
805
+ },
806
+ {
807
+ "op": "null",
808
+ "name": "conv_7_dw_batchnorm_moving_mean",
809
+ "attrs": {
810
+ "__init__": "[\"zero\", {}]",
811
+ "fix_gamma": "True"
812
+ },
813
+ "inputs": []
814
+ },
815
+ {
816
+ "op": "null",
817
+ "name": "conv_7_dw_batchnorm_moving_var",
818
+ "attrs": {
819
+ "__init__": "[\"one\", {}]",
820
+ "fix_gamma": "True"
821
+ },
822
+ "inputs": []
823
+ },
824
+ {
825
+ "op": "BatchNorm",
826
+ "name": "conv_7_dw_batchnorm",
827
+ "attrs": {"fix_gamma": "True"},
828
+ "inputs": [[92, 0, 0], [93, 0, 0], [94, 0, 0], [95, 0, 1], [96, 0, 1]]
829
+ },
830
+ {
831
+ "op": "Activation",
832
+ "name": "conv_7_dw_relu",
833
+ "attrs": {"act_type": "relu"},
834
+ "inputs": [[97, 0, 0]]
835
+ },
836
+ {
837
+ "op": "null",
838
+ "name": "conv_7_conv2d_weight",
839
+ "attrs": {
840
+ "kernel": "(1, 1)",
841
+ "no_bias": "True",
842
+ "num_filter": "128",
843
+ "num_group": "1",
844
+ "pad": "(0, 0)",
845
+ "stride": "(1, 1)"
846
+ },
847
+ "inputs": []
848
+ },
849
+ {
850
+ "op": "Convolution",
851
+ "name": "conv_7_conv2d",
852
+ "attrs": {
853
+ "kernel": "(1, 1)",
854
+ "no_bias": "True",
855
+ "num_filter": "128",
856
+ "num_group": "1",
857
+ "pad": "(0, 0)",
858
+ "stride": "(1, 1)"
859
+ },
860
+ "inputs": [[98, 0, 0], [99, 0, 0]]
861
+ },
862
+ {
863
+ "op": "null",
864
+ "name": "conv_7_batchnorm_gamma",
865
+ "attrs": {"fix_gamma": "True"},
866
+ "inputs": []
867
+ },
868
+ {
869
+ "op": "null",
870
+ "name": "conv_7_batchnorm_beta",
871
+ "attrs": {"fix_gamma": "True"},
872
+ "inputs": []
873
+ },
874
+ {
875
+ "op": "null",
876
+ "name": "conv_7_batchnorm_moving_mean",
877
+ "attrs": {
878
+ "__init__": "[\"zero\", {}]",
879
+ "fix_gamma": "True"
880
+ },
881
+ "inputs": []
882
+ },
883
+ {
884
+ "op": "null",
885
+ "name": "conv_7_batchnorm_moving_var",
886
+ "attrs": {
887
+ "__init__": "[\"one\", {}]",
888
+ "fix_gamma": "True"
889
+ },
890
+ "inputs": []
891
+ },
892
+ {
893
+ "op": "BatchNorm",
894
+ "name": "conv_7_batchnorm",
895
+ "attrs": {"fix_gamma": "True"},
896
+ "inputs": [[100, 0, 0], [101, 0, 0], [102, 0, 0], [103, 0, 1], [104, 0, 1]]
897
+ },
898
+ {
899
+ "op": "Activation",
900
+ "name": "conv_7_relu",
901
+ "attrs": {"act_type": "relu"},
902
+ "inputs": [[105, 0, 0]]
903
+ },
904
+ {
905
+ "op": "null",
906
+ "name": "conv_8_dw_conv2d_weight",
907
+ "attrs": {
908
+ "kernel": "(3, 3)",
909
+ "no_bias": "True",
910
+ "num_filter": "128",
911
+ "num_group": "128",
912
+ "pad": "(1, 1)",
913
+ "stride": "(1, 1)"
914
+ },
915
+ "inputs": []
916
+ },
917
+ {
918
+ "op": "Convolution",
919
+ "name": "conv_8_dw_conv2d",
920
+ "attrs": {
921
+ "kernel": "(3, 3)",
922
+ "no_bias": "True",
923
+ "num_filter": "128",
924
+ "num_group": "128",
925
+ "pad": "(1, 1)",
926
+ "stride": "(1, 1)"
927
+ },
928
+ "inputs": [[106, 0, 0], [107, 0, 0]]
929
+ },
930
+ {
931
+ "op": "null",
932
+ "name": "conv_8_dw_batchnorm_gamma",
933
+ "attrs": {"fix_gamma": "True"},
934
+ "inputs": []
935
+ },
936
+ {
937
+ "op": "null",
938
+ "name": "conv_8_dw_batchnorm_beta",
939
+ "attrs": {"fix_gamma": "True"},
940
+ "inputs": []
941
+ },
942
+ {
943
+ "op": "null",
944
+ "name": "conv_8_dw_batchnorm_moving_mean",
945
+ "attrs": {
946
+ "__init__": "[\"zero\", {}]",
947
+ "fix_gamma": "True"
948
+ },
949
+ "inputs": []
950
+ },
951
+ {
952
+ "op": "null",
953
+ "name": "conv_8_dw_batchnorm_moving_var",
954
+ "attrs": {
955
+ "__init__": "[\"one\", {}]",
956
+ "fix_gamma": "True"
957
+ },
958
+ "inputs": []
959
+ },
960
+ {
961
+ "op": "BatchNorm",
962
+ "name": "conv_8_dw_batchnorm",
963
+ "attrs": {"fix_gamma": "True"},
964
+ "inputs": [[108, 0, 0], [109, 0, 0], [110, 0, 0], [111, 0, 1], [112, 0, 1]]
965
+ },
966
+ {
967
+ "op": "Activation",
968
+ "name": "conv_8_dw_relu",
969
+ "attrs": {"act_type": "relu"},
970
+ "inputs": [[113, 0, 0]]
971
+ },
972
+ {
973
+ "op": "null",
974
+ "name": "conv_8_conv2d_weight",
975
+ "attrs": {
976
+ "kernel": "(1, 1)",
977
+ "no_bias": "True",
978
+ "num_filter": "128",
979
+ "num_group": "1",
980
+ "pad": "(0, 0)",
981
+ "stride": "(1, 1)"
982
+ },
983
+ "inputs": []
984
+ },
985
+ {
986
+ "op": "Convolution",
987
+ "name": "conv_8_conv2d",
988
+ "attrs": {
989
+ "kernel": "(1, 1)",
990
+ "no_bias": "True",
991
+ "num_filter": "128",
992
+ "num_group": "1",
993
+ "pad": "(0, 0)",
994
+ "stride": "(1, 1)"
995
+ },
996
+ "inputs": [[114, 0, 0], [115, 0, 0]]
997
+ },
998
+ {
999
+ "op": "null",
1000
+ "name": "conv_8_batchnorm_gamma",
1001
+ "attrs": {"fix_gamma": "True"},
1002
+ "inputs": []
1003
+ },
1004
+ {
1005
+ "op": "null",
1006
+ "name": "conv_8_batchnorm_beta",
1007
+ "attrs": {"fix_gamma": "True"},
1008
+ "inputs": []
1009
+ },
1010
+ {
1011
+ "op": "null",
1012
+ "name": "conv_8_batchnorm_moving_mean",
1013
+ "attrs": {
1014
+ "__init__": "[\"zero\", {}]",
1015
+ "fix_gamma": "True"
1016
+ },
1017
+ "inputs": []
1018
+ },
1019
+ {
1020
+ "op": "null",
1021
+ "name": "conv_8_batchnorm_moving_var",
1022
+ "attrs": {
1023
+ "__init__": "[\"one\", {}]",
1024
+ "fix_gamma": "True"
1025
+ },
1026
+ "inputs": []
1027
+ },
1028
+ {
1029
+ "op": "BatchNorm",
1030
+ "name": "conv_8_batchnorm",
1031
+ "attrs": {"fix_gamma": "True"},
1032
+ "inputs": [[116, 0, 0], [117, 0, 0], [118, 0, 0], [119, 0, 1], [120, 0, 1]]
1033
+ },
1034
+ {
1035
+ "op": "Activation",
1036
+ "name": "conv_8_relu",
1037
+ "attrs": {"act_type": "relu"},
1038
+ "inputs": [[121, 0, 0]]
1039
+ },
1040
+ {
1041
+ "op": "null",
1042
+ "name": "conv_9_dw_conv2d_weight",
1043
+ "attrs": {
1044
+ "kernel": "(3, 3)",
1045
+ "no_bias": "True",
1046
+ "num_filter": "128",
1047
+ "num_group": "128",
1048
+ "pad": "(1, 1)",
1049
+ "stride": "(1, 1)"
1050
+ },
1051
+ "inputs": []
1052
+ },
1053
+ {
1054
+ "op": "Convolution",
1055
+ "name": "conv_9_dw_conv2d",
1056
+ "attrs": {
1057
+ "kernel": "(3, 3)",
1058
+ "no_bias": "True",
1059
+ "num_filter": "128",
1060
+ "num_group": "128",
1061
+ "pad": "(1, 1)",
1062
+ "stride": "(1, 1)"
1063
+ },
1064
+ "inputs": [[122, 0, 0], [123, 0, 0]]
1065
+ },
1066
+ {
1067
+ "op": "null",
1068
+ "name": "conv_9_dw_batchnorm_gamma",
1069
+ "attrs": {"fix_gamma": "True"},
1070
+ "inputs": []
1071
+ },
1072
+ {
1073
+ "op": "null",
1074
+ "name": "conv_9_dw_batchnorm_beta",
1075
+ "attrs": {"fix_gamma": "True"},
1076
+ "inputs": []
1077
+ },
1078
+ {
1079
+ "op": "null",
1080
+ "name": "conv_9_dw_batchnorm_moving_mean",
1081
+ "attrs": {
1082
+ "__init__": "[\"zero\", {}]",
1083
+ "fix_gamma": "True"
1084
+ },
1085
+ "inputs": []
1086
+ },
1087
+ {
1088
+ "op": "null",
1089
+ "name": "conv_9_dw_batchnorm_moving_var",
1090
+ "attrs": {
1091
+ "__init__": "[\"one\", {}]",
1092
+ "fix_gamma": "True"
1093
+ },
1094
+ "inputs": []
1095
+ },
1096
+ {
1097
+ "op": "BatchNorm",
1098
+ "name": "conv_9_dw_batchnorm",
1099
+ "attrs": {"fix_gamma": "True"},
1100
+ "inputs": [[124, 0, 0], [125, 0, 0], [126, 0, 0], [127, 0, 1], [128, 0, 1]]
1101
+ },
1102
+ {
1103
+ "op": "Activation",
1104
+ "name": "conv_9_dw_relu",
1105
+ "attrs": {"act_type": "relu"},
1106
+ "inputs": [[129, 0, 0]]
1107
+ },
1108
+ {
1109
+ "op": "null",
1110
+ "name": "conv_9_conv2d_weight",
1111
+ "attrs": {
1112
+ "kernel": "(1, 1)",
1113
+ "no_bias": "True",
1114
+ "num_filter": "128",
1115
+ "num_group": "1",
1116
+ "pad": "(0, 0)",
1117
+ "stride": "(1, 1)"
1118
+ },
1119
+ "inputs": []
1120
+ },
1121
+ {
1122
+ "op": "Convolution",
1123
+ "name": "conv_9_conv2d",
1124
+ "attrs": {
1125
+ "kernel": "(1, 1)",
1126
+ "no_bias": "True",
1127
+ "num_filter": "128",
1128
+ "num_group": "1",
1129
+ "pad": "(0, 0)",
1130
+ "stride": "(1, 1)"
1131
+ },
1132
+ "inputs": [[130, 0, 0], [131, 0, 0]]
1133
+ },
1134
+ {
1135
+ "op": "null",
1136
+ "name": "conv_9_batchnorm_gamma",
1137
+ "attrs": {"fix_gamma": "True"},
1138
+ "inputs": []
1139
+ },
1140
+ {
1141
+ "op": "null",
1142
+ "name": "conv_9_batchnorm_beta",
1143
+ "attrs": {"fix_gamma": "True"},
1144
+ "inputs": []
1145
+ },
1146
+ {
1147
+ "op": "null",
1148
+ "name": "conv_9_batchnorm_moving_mean",
1149
+ "attrs": {
1150
+ "__init__": "[\"zero\", {}]",
1151
+ "fix_gamma": "True"
1152
+ },
1153
+ "inputs": []
1154
+ },
1155
+ {
1156
+ "op": "null",
1157
+ "name": "conv_9_batchnorm_moving_var",
1158
+ "attrs": {
1159
+ "__init__": "[\"one\", {}]",
1160
+ "fix_gamma": "True"
1161
+ },
1162
+ "inputs": []
1163
+ },
1164
+ {
1165
+ "op": "BatchNorm",
1166
+ "name": "conv_9_batchnorm",
1167
+ "attrs": {"fix_gamma": "True"},
1168
+ "inputs": [[132, 0, 0], [133, 0, 0], [134, 0, 0], [135, 0, 1], [136, 0, 1]]
1169
+ },
1170
+ {
1171
+ "op": "Activation",
1172
+ "name": "conv_9_relu",
1173
+ "attrs": {"act_type": "relu"},
1174
+ "inputs": [[137, 0, 0]]
1175
+ },
1176
+ {
1177
+ "op": "null",
1178
+ "name": "conv_10_dw_conv2d_weight",
1179
+ "attrs": {
1180
+ "kernel": "(3, 3)",
1181
+ "no_bias": "True",
1182
+ "num_filter": "128",
1183
+ "num_group": "128",
1184
+ "pad": "(1, 1)",
1185
+ "stride": "(1, 1)"
1186
+ },
1187
+ "inputs": []
1188
+ },
1189
+ {
1190
+ "op": "Convolution",
1191
+ "name": "conv_10_dw_conv2d",
1192
+ "attrs": {
1193
+ "kernel": "(3, 3)",
1194
+ "no_bias": "True",
1195
+ "num_filter": "128",
1196
+ "num_group": "128",
1197
+ "pad": "(1, 1)",
1198
+ "stride": "(1, 1)"
1199
+ },
1200
+ "inputs": [[138, 0, 0], [139, 0, 0]]
1201
+ },
1202
+ {
1203
+ "op": "null",
1204
+ "name": "conv_10_dw_batchnorm_gamma",
1205
+ "attrs": {"fix_gamma": "True"},
1206
+ "inputs": []
1207
+ },
1208
+ {
1209
+ "op": "null",
1210
+ "name": "conv_10_dw_batchnorm_beta",
1211
+ "attrs": {"fix_gamma": "True"},
1212
+ "inputs": []
1213
+ },
1214
+ {
1215
+ "op": "null",
1216
+ "name": "conv_10_dw_batchnorm_moving_mean",
1217
+ "attrs": {
1218
+ "__init__": "[\"zero\", {}]",
1219
+ "fix_gamma": "True"
1220
+ },
1221
+ "inputs": []
1222
+ },
1223
+ {
1224
+ "op": "null",
1225
+ "name": "conv_10_dw_batchnorm_moving_var",
1226
+ "attrs": {
1227
+ "__init__": "[\"one\", {}]",
1228
+ "fix_gamma": "True"
1229
+ },
1230
+ "inputs": []
1231
+ },
1232
+ {
1233
+ "op": "BatchNorm",
1234
+ "name": "conv_10_dw_batchnorm",
1235
+ "attrs": {"fix_gamma": "True"},
1236
+ "inputs": [[140, 0, 0], [141, 0, 0], [142, 0, 0], [143, 0, 1], [144, 0, 1]]
1237
+ },
1238
+ {
1239
+ "op": "Activation",
1240
+ "name": "conv_10_dw_relu",
1241
+ "attrs": {"act_type": "relu"},
1242
+ "inputs": [[145, 0, 0]]
1243
+ },
1244
+ {
1245
+ "op": "null",
1246
+ "name": "conv_10_conv2d_weight",
1247
+ "attrs": {
1248
+ "kernel": "(1, 1)",
1249
+ "no_bias": "True",
1250
+ "num_filter": "128",
1251
+ "num_group": "1",
1252
+ "pad": "(0, 0)",
1253
+ "stride": "(1, 1)"
1254
+ },
1255
+ "inputs": []
1256
+ },
1257
+ {
1258
+ "op": "Convolution",
1259
+ "name": "conv_10_conv2d",
1260
+ "attrs": {
1261
+ "kernel": "(1, 1)",
1262
+ "no_bias": "True",
1263
+ "num_filter": "128",
1264
+ "num_group": "1",
1265
+ "pad": "(0, 0)",
1266
+ "stride": "(1, 1)"
1267
+ },
1268
+ "inputs": [[146, 0, 0], [147, 0, 0]]
1269
+ },
1270
+ {
1271
+ "op": "null",
1272
+ "name": "conv_10_batchnorm_gamma",
1273
+ "attrs": {"fix_gamma": "True"},
1274
+ "inputs": []
1275
+ },
1276
+ {
1277
+ "op": "null",
1278
+ "name": "conv_10_batchnorm_beta",
1279
+ "attrs": {"fix_gamma": "True"},
1280
+ "inputs": []
1281
+ },
1282
+ {
1283
+ "op": "null",
1284
+ "name": "conv_10_batchnorm_moving_mean",
1285
+ "attrs": {
1286
+ "__init__": "[\"zero\", {}]",
1287
+ "fix_gamma": "True"
1288
+ },
1289
+ "inputs": []
1290
+ },
1291
+ {
1292
+ "op": "null",
1293
+ "name": "conv_10_batchnorm_moving_var",
1294
+ "attrs": {
1295
+ "__init__": "[\"one\", {}]",
1296
+ "fix_gamma": "True"
1297
+ },
1298
+ "inputs": []
1299
+ },
1300
+ {
1301
+ "op": "BatchNorm",
1302
+ "name": "conv_10_batchnorm",
1303
+ "attrs": {"fix_gamma": "True"},
1304
+ "inputs": [[148, 0, 0], [149, 0, 0], [150, 0, 0], [151, 0, 1], [152, 0, 1]]
1305
+ },
1306
+ {
1307
+ "op": "Activation",
1308
+ "name": "conv_10_relu",
1309
+ "attrs": {"act_type": "relu"},
1310
+ "inputs": [[153, 0, 0]]
1311
+ },
1312
+ {
1313
+ "op": "null",
1314
+ "name": "conv_11_dw_conv2d_weight",
1315
+ "attrs": {
1316
+ "kernel": "(3, 3)",
1317
+ "no_bias": "True",
1318
+ "num_filter": "128",
1319
+ "num_group": "128",
1320
+ "pad": "(1, 1)",
1321
+ "stride": "(1, 1)"
1322
+ },
1323
+ "inputs": []
1324
+ },
1325
+ {
1326
+ "op": "Convolution",
1327
+ "name": "conv_11_dw_conv2d",
1328
+ "attrs": {
1329
+ "kernel": "(3, 3)",
1330
+ "no_bias": "True",
1331
+ "num_filter": "128",
1332
+ "num_group": "128",
1333
+ "pad": "(1, 1)",
1334
+ "stride": "(1, 1)"
1335
+ },
1336
+ "inputs": [[154, 0, 0], [155, 0, 0]]
1337
+ },
1338
+ {
1339
+ "op": "null",
1340
+ "name": "conv_11_dw_batchnorm_gamma",
1341
+ "attrs": {"fix_gamma": "True"},
1342
+ "inputs": []
1343
+ },
1344
+ {
1345
+ "op": "null",
1346
+ "name": "conv_11_dw_batchnorm_beta",
1347
+ "attrs": {"fix_gamma": "True"},
1348
+ "inputs": []
1349
+ },
1350
+ {
1351
+ "op": "null",
1352
+ "name": "conv_11_dw_batchnorm_moving_mean",
1353
+ "attrs": {
1354
+ "__init__": "[\"zero\", {}]",
1355
+ "fix_gamma": "True"
1356
+ },
1357
+ "inputs": []
1358
+ },
1359
+ {
1360
+ "op": "null",
1361
+ "name": "conv_11_dw_batchnorm_moving_var",
1362
+ "attrs": {
1363
+ "__init__": "[\"one\", {}]",
1364
+ "fix_gamma": "True"
1365
+ },
1366
+ "inputs": []
1367
+ },
1368
+ {
1369
+ "op": "BatchNorm",
1370
+ "name": "conv_11_dw_batchnorm",
1371
+ "attrs": {"fix_gamma": "True"},
1372
+ "inputs": [[156, 0, 0], [157, 0, 0], [158, 0, 0], [159, 0, 1], [160, 0, 1]]
1373
+ },
1374
+ {
1375
+ "op": "Activation",
1376
+ "name": "conv_11_dw_relu",
1377
+ "attrs": {"act_type": "relu"},
1378
+ "inputs": [[161, 0, 0]]
1379
+ },
1380
+ {
1381
+ "op": "null",
1382
+ "name": "conv_11_conv2d_weight",
1383
+ "attrs": {
1384
+ "kernel": "(1, 1)",
1385
+ "no_bias": "True",
1386
+ "num_filter": "128",
1387
+ "num_group": "1",
1388
+ "pad": "(0, 0)",
1389
+ "stride": "(1, 1)"
1390
+ },
1391
+ "inputs": []
1392
+ },
1393
+ {
1394
+ "op": "Convolution",
1395
+ "name": "conv_11_conv2d",
1396
+ "attrs": {
1397
+ "kernel": "(1, 1)",
1398
+ "no_bias": "True",
1399
+ "num_filter": "128",
1400
+ "num_group": "1",
1401
+ "pad": "(0, 0)",
1402
+ "stride": "(1, 1)"
1403
+ },
1404
+ "inputs": [[162, 0, 0], [163, 0, 0]]
1405
+ },
1406
+ {
1407
+ "op": "null",
1408
+ "name": "conv_11_batchnorm_gamma",
1409
+ "attrs": {"fix_gamma": "True"},
1410
+ "inputs": []
1411
+ },
1412
+ {
1413
+ "op": "null",
1414
+ "name": "conv_11_batchnorm_beta",
1415
+ "attrs": {"fix_gamma": "True"},
1416
+ "inputs": []
1417
+ },
1418
+ {
1419
+ "op": "null",
1420
+ "name": "conv_11_batchnorm_moving_mean",
1421
+ "attrs": {
1422
+ "__init__": "[\"zero\", {}]",
1423
+ "fix_gamma": "True"
1424
+ },
1425
+ "inputs": []
1426
+ },
1427
+ {
1428
+ "op": "null",
1429
+ "name": "conv_11_batchnorm_moving_var",
1430
+ "attrs": {
1431
+ "__init__": "[\"one\", {}]",
1432
+ "fix_gamma": "True"
1433
+ },
1434
+ "inputs": []
1435
+ },
1436
+ {
1437
+ "op": "BatchNorm",
1438
+ "name": "conv_11_batchnorm",
1439
+ "attrs": {"fix_gamma": "True"},
1440
+ "inputs": [[164, 0, 0], [165, 0, 0], [166, 0, 0], [167, 0, 1], [168, 0, 1]]
1441
+ },
1442
+ {
1443
+ "op": "Activation",
1444
+ "name": "conv_11_relu",
1445
+ "attrs": {"act_type": "relu"},
1446
+ "inputs": [[169, 0, 0]]
1447
+ },
1448
+ {
1449
+ "op": "null",
1450
+ "name": "conv_12_dw_conv2d_weight",
1451
+ "attrs": {
1452
+ "kernel": "(3, 3)",
1453
+ "no_bias": "True",
1454
+ "num_filter": "128",
1455
+ "num_group": "128",
1456
+ "pad": "(1, 1)",
1457
+ "stride": "(1, 1)"
1458
+ },
1459
+ "inputs": []
1460
+ },
1461
+ {
1462
+ "op": "Convolution",
1463
+ "name": "conv_12_dw_conv2d",
1464
+ "attrs": {
1465
+ "kernel": "(3, 3)",
1466
+ "no_bias": "True",
1467
+ "num_filter": "128",
1468
+ "num_group": "128",
1469
+ "pad": "(1, 1)",
1470
+ "stride": "(1, 1)"
1471
+ },
1472
+ "inputs": [[170, 0, 0], [171, 0, 0]]
1473
+ },
1474
+ {
1475
+ "op": "null",
1476
+ "name": "conv_12_dw_batchnorm_gamma",
1477
+ "attrs": {"fix_gamma": "True"},
1478
+ "inputs": []
1479
+ },
1480
+ {
1481
+ "op": "null",
1482
+ "name": "conv_12_dw_batchnorm_beta",
1483
+ "attrs": {"fix_gamma": "True"},
1484
+ "inputs": []
1485
+ },
1486
+ {
1487
+ "op": "null",
1488
+ "name": "conv_12_dw_batchnorm_moving_mean",
1489
+ "attrs": {
1490
+ "__init__": "[\"zero\", {}]",
1491
+ "fix_gamma": "True"
1492
+ },
1493
+ "inputs": []
1494
+ },
1495
+ {
1496
+ "op": "null",
1497
+ "name": "conv_12_dw_batchnorm_moving_var",
1498
+ "attrs": {
1499
+ "__init__": "[\"one\", {}]",
1500
+ "fix_gamma": "True"
1501
+ },
1502
+ "inputs": []
1503
+ },
1504
+ {
1505
+ "op": "BatchNorm",
1506
+ "name": "conv_12_dw_batchnorm",
1507
+ "attrs": {"fix_gamma": "True"},
1508
+ "inputs": [[172, 0, 0], [173, 0, 0], [174, 0, 0], [175, 0, 1], [176, 0, 1]]
1509
+ },
1510
+ {
1511
+ "op": "Activation",
1512
+ "name": "conv_12_dw_relu",
1513
+ "attrs": {"act_type": "relu"},
1514
+ "inputs": [[177, 0, 0]]
1515
+ },
1516
+ {
1517
+ "op": "null",
1518
+ "name": "conv_12_conv2d_weight",
1519
+ "attrs": {
1520
+ "kernel": "(1, 1)",
1521
+ "no_bias": "True",
1522
+ "num_filter": "128",
1523
+ "num_group": "1",
1524
+ "pad": "(0, 0)",
1525
+ "stride": "(1, 1)"
1526
+ },
1527
+ "inputs": []
1528
+ },
1529
+ {
1530
+ "op": "Convolution",
1531
+ "name": "conv_12_conv2d",
1532
+ "attrs": {
1533
+ "kernel": "(1, 1)",
1534
+ "no_bias": "True",
1535
+ "num_filter": "128",
1536
+ "num_group": "1",
1537
+ "pad": "(0, 0)",
1538
+ "stride": "(1, 1)"
1539
+ },
1540
+ "inputs": [[178, 0, 0], [179, 0, 0]]
1541
+ },
1542
+ {
1543
+ "op": "null",
1544
+ "name": "conv_12_batchnorm_gamma",
1545
+ "attrs": {"fix_gamma": "True"},
1546
+ "inputs": []
1547
+ },
1548
+ {
1549
+ "op": "null",
1550
+ "name": "conv_12_batchnorm_beta",
1551
+ "attrs": {"fix_gamma": "True"},
1552
+ "inputs": []
1553
+ },
1554
+ {
1555
+ "op": "null",
1556
+ "name": "conv_12_batchnorm_moving_mean",
1557
+ "attrs": {
1558
+ "__init__": "[\"zero\", {}]",
1559
+ "fix_gamma": "True"
1560
+ },
1561
+ "inputs": []
1562
+ },
1563
+ {
1564
+ "op": "null",
1565
+ "name": "conv_12_batchnorm_moving_var",
1566
+ "attrs": {
1567
+ "__init__": "[\"one\", {}]",
1568
+ "fix_gamma": "True"
1569
+ },
1570
+ "inputs": []
1571
+ },
1572
+ {
1573
+ "op": "BatchNorm",
1574
+ "name": "conv_12_batchnorm",
1575
+ "attrs": {"fix_gamma": "True"},
1576
+ "inputs": [[180, 0, 0], [181, 0, 0], [182, 0, 0], [183, 0, 1], [184, 0, 1]]
1577
+ },
1578
+ {
1579
+ "op": "Activation",
1580
+ "name": "conv_12_relu",
1581
+ "attrs": {"act_type": "relu"},
1582
+ "inputs": [[185, 0, 0]]
1583
+ },
1584
+ {
1585
+ "op": "null",
1586
+ "name": "conv_13_dw_conv2d_weight",
1587
+ "attrs": {
1588
+ "kernel": "(3, 3)",
1589
+ "no_bias": "True",
1590
+ "num_filter": "128",
1591
+ "num_group": "128",
1592
+ "pad": "(1, 1)",
1593
+ "stride": "(2, 2)"
1594
+ },
1595
+ "inputs": []
1596
+ },
1597
+ {
1598
+ "op": "Convolution",
1599
+ "name": "conv_13_dw_conv2d",
1600
+ "attrs": {
1601
+ "kernel": "(3, 3)",
1602
+ "no_bias": "True",
1603
+ "num_filter": "128",
1604
+ "num_group": "128",
1605
+ "pad": "(1, 1)",
1606
+ "stride": "(2, 2)"
1607
+ },
1608
+ "inputs": [[186, 0, 0], [187, 0, 0]]
1609
+ },
1610
+ {
1611
+ "op": "null",
1612
+ "name": "conv_13_dw_batchnorm_gamma",
1613
+ "attrs": {"fix_gamma": "True"},
1614
+ "inputs": []
1615
+ },
1616
+ {
1617
+ "op": "null",
1618
+ "name": "conv_13_dw_batchnorm_beta",
1619
+ "attrs": {"fix_gamma": "True"},
1620
+ "inputs": []
1621
+ },
1622
+ {
1623
+ "op": "null",
1624
+ "name": "conv_13_dw_batchnorm_moving_mean",
1625
+ "attrs": {
1626
+ "__init__": "[\"zero\", {}]",
1627
+ "fix_gamma": "True"
1628
+ },
1629
+ "inputs": []
1630
+ },
1631
+ {
1632
+ "op": "null",
1633
+ "name": "conv_13_dw_batchnorm_moving_var",
1634
+ "attrs": {
1635
+ "__init__": "[\"one\", {}]",
1636
+ "fix_gamma": "True"
1637
+ },
1638
+ "inputs": []
1639
+ },
1640
+ {
1641
+ "op": "BatchNorm",
1642
+ "name": "conv_13_dw_batchnorm",
1643
+ "attrs": {"fix_gamma": "True"},
1644
+ "inputs": [[188, 0, 0], [189, 0, 0], [190, 0, 0], [191, 0, 1], [192, 0, 1]]
1645
+ },
1646
+ {
1647
+ "op": "Activation",
1648
+ "name": "conv_13_dw_relu",
1649
+ "attrs": {"act_type": "relu"},
1650
+ "inputs": [[193, 0, 0]]
1651
+ },
1652
+ {
1653
+ "op": "null",
1654
+ "name": "conv_13_conv2d_weight",
1655
+ "attrs": {
1656
+ "kernel": "(1, 1)",
1657
+ "no_bias": "True",
1658
+ "num_filter": "256",
1659
+ "num_group": "1",
1660
+ "pad": "(0, 0)",
1661
+ "stride": "(1, 1)"
1662
+ },
1663
+ "inputs": []
1664
+ },
1665
+ {
1666
+ "op": "Convolution",
1667
+ "name": "conv_13_conv2d",
1668
+ "attrs": {
1669
+ "kernel": "(1, 1)",
1670
+ "no_bias": "True",
1671
+ "num_filter": "256",
1672
+ "num_group": "1",
1673
+ "pad": "(0, 0)",
1674
+ "stride": "(1, 1)"
1675
+ },
1676
+ "inputs": [[194, 0, 0], [195, 0, 0]]
1677
+ },
1678
+ {
1679
+ "op": "null",
1680
+ "name": "conv_13_batchnorm_gamma",
1681
+ "attrs": {"fix_gamma": "True"},
1682
+ "inputs": []
1683
+ },
1684
+ {
1685
+ "op": "null",
1686
+ "name": "conv_13_batchnorm_beta",
1687
+ "attrs": {"fix_gamma": "True"},
1688
+ "inputs": []
1689
+ },
1690
+ {
1691
+ "op": "null",
1692
+ "name": "conv_13_batchnorm_moving_mean",
1693
+ "attrs": {
1694
+ "__init__": "[\"zero\", {}]",
1695
+ "fix_gamma": "True"
1696
+ },
1697
+ "inputs": []
1698
+ },
1699
+ {
1700
+ "op": "null",
1701
+ "name": "conv_13_batchnorm_moving_var",
1702
+ "attrs": {
1703
+ "__init__": "[\"one\", {}]",
1704
+ "fix_gamma": "True"
1705
+ },
1706
+ "inputs": []
1707
+ },
1708
+ {
1709
+ "op": "BatchNorm",
1710
+ "name": "conv_13_batchnorm",
1711
+ "attrs": {"fix_gamma": "True"},
1712
+ "inputs": [[196, 0, 0], [197, 0, 0], [198, 0, 0], [199, 0, 1], [200, 0, 1]]
1713
+ },
1714
+ {
1715
+ "op": "Activation",
1716
+ "name": "conv_13_relu",
1717
+ "attrs": {"act_type": "relu"},
1718
+ "inputs": [[201, 0, 0]]
1719
+ },
1720
+ {
1721
+ "op": "null",
1722
+ "name": "conv_14_dw_conv2d_weight",
1723
+ "attrs": {
1724
+ "kernel": "(3, 3)",
1725
+ "no_bias": "True",
1726
+ "num_filter": "256",
1727
+ "num_group": "256",
1728
+ "pad": "(1, 1)",
1729
+ "stride": "(1, 1)"
1730
+ },
1731
+ "inputs": []
1732
+ },
1733
+ {
1734
+ "op": "Convolution",
1735
+ "name": "conv_14_dw_conv2d",
1736
+ "attrs": {
1737
+ "kernel": "(3, 3)",
1738
+ "no_bias": "True",
1739
+ "num_filter": "256",
1740
+ "num_group": "256",
1741
+ "pad": "(1, 1)",
1742
+ "stride": "(1, 1)"
1743
+ },
1744
+ "inputs": [[202, 0, 0], [203, 0, 0]]
1745
+ },
1746
+ {
1747
+ "op": "null",
1748
+ "name": "conv_14_dw_batchnorm_gamma",
1749
+ "attrs": {"fix_gamma": "True"},
1750
+ "inputs": []
1751
+ },
1752
+ {
1753
+ "op": "null",
1754
+ "name": "conv_14_dw_batchnorm_beta",
1755
+ "attrs": {"fix_gamma": "True"},
1756
+ "inputs": []
1757
+ },
1758
+ {
1759
+ "op": "null",
1760
+ "name": "conv_14_dw_batchnorm_moving_mean",
1761
+ "attrs": {
1762
+ "__init__": "[\"zero\", {}]",
1763
+ "fix_gamma": "True"
1764
+ },
1765
+ "inputs": []
1766
+ },
1767
+ {
1768
+ "op": "null",
1769
+ "name": "conv_14_dw_batchnorm_moving_var",
1770
+ "attrs": {
1771
+ "__init__": "[\"one\", {}]",
1772
+ "fix_gamma": "True"
1773
+ },
1774
+ "inputs": []
1775
+ },
1776
+ {
1777
+ "op": "BatchNorm",
1778
+ "name": "conv_14_dw_batchnorm",
1779
+ "attrs": {"fix_gamma": "True"},
1780
+ "inputs": [[204, 0, 0], [205, 0, 0], [206, 0, 0], [207, 0, 1], [208, 0, 1]]
1781
+ },
1782
+ {
1783
+ "op": "Activation",
1784
+ "name": "conv_14_dw_relu",
1785
+ "attrs": {"act_type": "relu"},
1786
+ "inputs": [[209, 0, 0]]
1787
+ },
1788
+ {
1789
+ "op": "null",
1790
+ "name": "conv_14_conv2d_weight",
1791
+ "attrs": {
1792
+ "kernel": "(1, 1)",
1793
+ "no_bias": "True",
1794
+ "num_filter": "256",
1795
+ "num_group": "1",
1796
+ "pad": "(0, 0)",
1797
+ "stride": "(1, 1)"
1798
+ },
1799
+ "inputs": []
1800
+ },
1801
+ {
1802
+ "op": "Convolution",
1803
+ "name": "conv_14_conv2d",
1804
+ "attrs": {
1805
+ "kernel": "(1, 1)",
1806
+ "no_bias": "True",
1807
+ "num_filter": "256",
1808
+ "num_group": "1",
1809
+ "pad": "(0, 0)",
1810
+ "stride": "(1, 1)"
1811
+ },
1812
+ "inputs": [[210, 0, 0], [211, 0, 0]]
1813
+ },
1814
+ {
1815
+ "op": "null",
1816
+ "name": "conv_14_batchnorm_gamma",
1817
+ "attrs": {"fix_gamma": "True"},
1818
+ "inputs": []
1819
+ },
1820
+ {
1821
+ "op": "null",
1822
+ "name": "conv_14_batchnorm_beta",
1823
+ "attrs": {"fix_gamma": "True"},
1824
+ "inputs": []
1825
+ },
1826
+ {
1827
+ "op": "null",
1828
+ "name": "conv_14_batchnorm_moving_mean",
1829
+ "attrs": {
1830
+ "__init__": "[\"zero\", {}]",
1831
+ "fix_gamma": "True"
1832
+ },
1833
+ "inputs": []
1834
+ },
1835
+ {
1836
+ "op": "null",
1837
+ "name": "conv_14_batchnorm_moving_var",
1838
+ "attrs": {
1839
+ "__init__": "[\"one\", {}]",
1840
+ "fix_gamma": "True"
1841
+ },
1842
+ "inputs": []
1843
+ },
1844
+ {
1845
+ "op": "BatchNorm",
1846
+ "name": "conv_14_batchnorm",
1847
+ "attrs": {"fix_gamma": "True"},
1848
+ "inputs": [[212, 0, 0], [213, 0, 0], [214, 0, 0], [215, 0, 1], [216, 0, 1]]
1849
+ },
1850
+ {
1851
+ "op": "Activation",
1852
+ "name": "conv_14_relu",
1853
+ "attrs": {"act_type": "relu"},
1854
+ "inputs": [[217, 0, 0]]
1855
+ },
1856
+ {
1857
+ "op": "null",
1858
+ "name": "bn1_gamma",
1859
+ "attrs": {
1860
+ "eps": "2e-05",
1861
+ "fix_gamma": "False",
1862
+ "momentum": "0.9"
1863
+ },
1864
+ "inputs": []
1865
+ },
1866
+ {
1867
+ "op": "null",
1868
+ "name": "bn1_beta",
1869
+ "attrs": {
1870
+ "eps": "2e-05",
1871
+ "fix_gamma": "False",
1872
+ "momentum": "0.9"
1873
+ },
1874
+ "inputs": []
1875
+ },
1876
+ {
1877
+ "op": "null",
1878
+ "name": "bn1_moving_mean",
1879
+ "attrs": {
1880
+ "__init__": "[\"zero\", {}]",
1881
+ "eps": "2e-05",
1882
+ "fix_gamma": "False",
1883
+ "momentum": "0.9"
1884
+ },
1885
+ "inputs": []
1886
+ },
1887
+ {
1888
+ "op": "null",
1889
+ "name": "bn1_moving_var",
1890
+ "attrs": {
1891
+ "__init__": "[\"one\", {}]",
1892
+ "eps": "2e-05",
1893
+ "fix_gamma": "False",
1894
+ "momentum": "0.9"
1895
+ },
1896
+ "inputs": []
1897
+ },
1898
+ {
1899
+ "op": "BatchNorm",
1900
+ "name": "bn1",
1901
+ "attrs": {
1902
+ "eps": "2e-05",
1903
+ "fix_gamma": "False",
1904
+ "momentum": "0.9"
1905
+ },
1906
+ "inputs": [[218, 0, 0], [219, 0, 0], [220, 0, 0], [221, 0, 1], [222, 0, 1]]
1907
+ },
1908
+ {
1909
+ "op": "null",
1910
+ "name": "relu1_gamma",
1911
+ "attrs": {
1912
+ "__init__": "[\"Constant\", {\"value\": 0.25}]",
1913
+ "act_type": "prelu"
1914
+ },
1915
+ "inputs": []
1916
+ },
1917
+ {
1918
+ "op": "LeakyReLU",
1919
+ "name": "relu1",
1920
+ "attrs": {"act_type": "prelu"},
1921
+ "inputs": [[223, 0, 0], [224, 0, 0]]
1922
+ },
1923
+ {
1924
+ "op": "Pooling",
1925
+ "name": "pool1",
1926
+ "attrs": {
1927
+ "global_pool": "True",
1928
+ "kernel": "(7, 7)",
1929
+ "pool_type": "avg"
1930
+ },
1931
+ "inputs": [[225, 0, 0]]
1932
+ },
1933
+ {
1934
+ "op": "Flatten",
1935
+ "name": "flatten0",
1936
+ "inputs": [[226, 0, 0]]
1937
+ },
1938
+ {
1939
+ "op": "null",
1940
+ "name": "pre_fc1_weight",
1941
+ "attrs": {"num_hidden": "202"},
1942
+ "inputs": []
1943
+ },
1944
+ {
1945
+ "op": "null",
1946
+ "name": "pre_fc1_bias",
1947
+ "attrs": {"num_hidden": "202"},
1948
+ "inputs": []
1949
+ },
1950
+ {
1951
+ "op": "FullyConnected",
1952
+ "name": "pre_fc1",
1953
+ "attrs": {"num_hidden": "202"},
1954
+ "inputs": [[227, 0, 0], [228, 0, 0], [229, 0, 0]]
1955
+ },
1956
+ {
1957
+ "op": "null",
1958
+ "name": "fc1_gamma",
1959
+ "attrs": {
1960
+ "eps": "2e-05",
1961
+ "fix_gamma": "True",
1962
+ "momentum": "0.9"
1963
+ },
1964
+ "inputs": []
1965
+ },
1966
+ {
1967
+ "op": "null",
1968
+ "name": "fc1_beta",
1969
+ "attrs": {
1970
+ "eps": "2e-05",
1971
+ "fix_gamma": "True",
1972
+ "momentum": "0.9"
1973
+ },
1974
+ "inputs": []
1975
+ },
1976
+ {
1977
+ "op": "null",
1978
+ "name": "fc1_moving_mean",
1979
+ "attrs": {
1980
+ "__init__": "[\"zero\", {}]",
1981
+ "eps": "2e-05",
1982
+ "fix_gamma": "True",
1983
+ "momentum": "0.9"
1984
+ },
1985
+ "inputs": []
1986
+ },
1987
+ {
1988
+ "op": "null",
1989
+ "name": "fc1_moving_var",
1990
+ "attrs": {
1991
+ "__init__": "[\"one\", {}]",
1992
+ "eps": "2e-05",
1993
+ "fix_gamma": "True",
1994
+ "momentum": "0.9"
1995
+ },
1996
+ "inputs": []
1997
+ },
1998
+ {
1999
+ "op": "BatchNorm",
2000
+ "name": "fc1",
2001
+ "attrs": {
2002
+ "eps": "2e-05",
2003
+ "fix_gamma": "True",
2004
+ "momentum": "0.9"
2005
+ },
2006
+ "inputs": [[230, 0, 0], [231, 0, 0], [232, 0, 0], [233, 0, 1], [234, 0, 1]]
2007
+ }
2008
+ ],
2009
+ "arg_nodes": [
2010
+ 0,
2011
+ 3,
2012
+ 5,
2013
+ 6,
2014
+ 7,
2015
+ 8,
2016
+ 11,
2017
+ 13,
2018
+ 14,
2019
+ 15,
2020
+ 16,
2021
+ 19,
2022
+ 21,
2023
+ 22,
2024
+ 23,
2025
+ 24,
2026
+ 27,
2027
+ 29,
2028
+ 30,
2029
+ 31,
2030
+ 32,
2031
+ 35,
2032
+ 37,
2033
+ 38,
2034
+ 39,
2035
+ 40,
2036
+ 43,
2037
+ 45,
2038
+ 46,
2039
+ 47,
2040
+ 48,
2041
+ 51,
2042
+ 53,
2043
+ 54,
2044
+ 55,
2045
+ 56,
2046
+ 59,
2047
+ 61,
2048
+ 62,
2049
+ 63,
2050
+ 64,
2051
+ 67,
2052
+ 69,
2053
+ 70,
2054
+ 71,
2055
+ 72,
2056
+ 75,
2057
+ 77,
2058
+ 78,
2059
+ 79,
2060
+ 80,
2061
+ 83,
2062
+ 85,
2063
+ 86,
2064
+ 87,
2065
+ 88,
2066
+ 91,
2067
+ 93,
2068
+ 94,
2069
+ 95,
2070
+ 96,
2071
+ 99,
2072
+ 101,
2073
+ 102,
2074
+ 103,
2075
+ 104,
2076
+ 107,
2077
+ 109,
2078
+ 110,
2079
+ 111,
2080
+ 112,
2081
+ 115,
2082
+ 117,
2083
+ 118,
2084
+ 119,
2085
+ 120,
2086
+ 123,
2087
+ 125,
2088
+ 126,
2089
+ 127,
2090
+ 128,
2091
+ 131,
2092
+ 133,
2093
+ 134,
2094
+ 135,
2095
+ 136,
2096
+ 139,
2097
+ 141,
2098
+ 142,
2099
+ 143,
2100
+ 144,
2101
+ 147,
2102
+ 149,
2103
+ 150,
2104
+ 151,
2105
+ 152,
2106
+ 155,
2107
+ 157,
2108
+ 158,
2109
+ 159,
2110
+ 160,
2111
+ 163,
2112
+ 165,
2113
+ 166,
2114
+ 167,
2115
+ 168,
2116
+ 171,
2117
+ 173,
2118
+ 174,
2119
+ 175,
2120
+ 176,
2121
+ 179,
2122
+ 181,
2123
+ 182,
2124
+ 183,
2125
+ 184,
2126
+ 187,
2127
+ 189,
2128
+ 190,
2129
+ 191,
2130
+ 192,
2131
+ 195,
2132
+ 197,
2133
+ 198,
2134
+ 199,
2135
+ 200,
2136
+ 203,
2137
+ 205,
2138
+ 206,
2139
+ 207,
2140
+ 208,
2141
+ 211,
2142
+ 213,
2143
+ 214,
2144
+ 215,
2145
+ 216,
2146
+ 219,
2147
+ 220,
2148
+ 221,
2149
+ 222,
2150
+ 224,
2151
+ 228,
2152
+ 229,
2153
+ 231,
2154
+ 232,
2155
+ 233,
2156
+ 234
2157
+ ],
2158
+ "node_row_ptr": [
2159
+ 0,
2160
+ 1,
2161
+ 2,
2162
+ 3,
2163
+ 4,
2164
+ 5,
2165
+ 6,
2166
+ 7,
2167
+ 8,
2168
+ 9,
2169
+ 12,
2170
+ 13,
2171
+ 14,
2172
+ 15,
2173
+ 16,
2174
+ 17,
2175
+ 18,
2176
+ 19,
2177
+ 22,
2178
+ 23,
2179
+ 24,
2180
+ 25,
2181
+ 26,
2182
+ 27,
2183
+ 28,
2184
+ 29,
2185
+ 32,
2186
+ 33,
2187
+ 34,
2188
+ 35,
2189
+ 36,
2190
+ 37,
2191
+ 38,
2192
+ 39,
2193
+ 42,
2194
+ 43,
2195
+ 44,
2196
+ 45,
2197
+ 46,
2198
+ 47,
2199
+ 48,
2200
+ 49,
2201
+ 52,
2202
+ 53,
2203
+ 54,
2204
+ 55,
2205
+ 56,
2206
+ 57,
2207
+ 58,
2208
+ 59,
2209
+ 62,
2210
+ 63,
2211
+ 64,
2212
+ 65,
2213
+ 66,
2214
+ 67,
2215
+ 68,
2216
+ 69,
2217
+ 72,
2218
+ 73,
2219
+ 74,
2220
+ 75,
2221
+ 76,
2222
+ 77,
2223
+ 78,
2224
+ 79,
2225
+ 82,
2226
+ 83,
2227
+ 84,
2228
+ 85,
2229
+ 86,
2230
+ 87,
2231
+ 88,
2232
+ 89,
2233
+ 92,
2234
+ 93,
2235
+ 94,
2236
+ 95,
2237
+ 96,
2238
+ 97,
2239
+ 98,
2240
+ 99,
2241
+ 102,
2242
+ 103,
2243
+ 104,
2244
+ 105,
2245
+ 106,
2246
+ 107,
2247
+ 108,
2248
+ 109,
2249
+ 112,
2250
+ 113,
2251
+ 114,
2252
+ 115,
2253
+ 116,
2254
+ 117,
2255
+ 118,
2256
+ 119,
2257
+ 122,
2258
+ 123,
2259
+ 124,
2260
+ 125,
2261
+ 126,
2262
+ 127,
2263
+ 128,
2264
+ 129,
2265
+ 132,
2266
+ 133,
2267
+ 134,
2268
+ 135,
2269
+ 136,
2270
+ 137,
2271
+ 138,
2272
+ 139,
2273
+ 142,
2274
+ 143,
2275
+ 144,
2276
+ 145,
2277
+ 146,
2278
+ 147,
2279
+ 148,
2280
+ 149,
2281
+ 152,
2282
+ 153,
2283
+ 154,
2284
+ 155,
2285
+ 156,
2286
+ 157,
2287
+ 158,
2288
+ 159,
2289
+ 162,
2290
+ 163,
2291
+ 164,
2292
+ 165,
2293
+ 166,
2294
+ 167,
2295
+ 168,
2296
+ 169,
2297
+ 172,
2298
+ 173,
2299
+ 174,
2300
+ 175,
2301
+ 176,
2302
+ 177,
2303
+ 178,
2304
+ 179,
2305
+ 182,
2306
+ 183,
2307
+ 184,
2308
+ 185,
2309
+ 186,
2310
+ 187,
2311
+ 188,
2312
+ 189,
2313
+ 192,
2314
+ 193,
2315
+ 194,
2316
+ 195,
2317
+ 196,
2318
+ 197,
2319
+ 198,
2320
+ 199,
2321
+ 202,
2322
+ 203,
2323
+ 204,
2324
+ 205,
2325
+ 206,
2326
+ 207,
2327
+ 208,
2328
+ 209,
2329
+ 212,
2330
+ 213,
2331
+ 214,
2332
+ 215,
2333
+ 216,
2334
+ 217,
2335
+ 218,
2336
+ 219,
2337
+ 222,
2338
+ 223,
2339
+ 224,
2340
+ 225,
2341
+ 226,
2342
+ 227,
2343
+ 228,
2344
+ 229,
2345
+ 232,
2346
+ 233,
2347
+ 234,
2348
+ 235,
2349
+ 236,
2350
+ 237,
2351
+ 238,
2352
+ 239,
2353
+ 242,
2354
+ 243,
2355
+ 244,
2356
+ 245,
2357
+ 246,
2358
+ 247,
2359
+ 248,
2360
+ 249,
2361
+ 252,
2362
+ 253,
2363
+ 254,
2364
+ 255,
2365
+ 256,
2366
+ 257,
2367
+ 258,
2368
+ 259,
2369
+ 262,
2370
+ 263,
2371
+ 264,
2372
+ 265,
2373
+ 266,
2374
+ 267,
2375
+ 268,
2376
+ 269,
2377
+ 272,
2378
+ 273,
2379
+ 274,
2380
+ 275,
2381
+ 276,
2382
+ 277,
2383
+ 280,
2384
+ 281,
2385
+ 282,
2386
+ 283,
2387
+ 284,
2388
+ 285,
2389
+ 286,
2390
+ 287,
2391
+ 288,
2392
+ 289,
2393
+ 290,
2394
+ 291,
2395
+ 294
2396
+ ],
2397
+ "heads": [[235, 0, 0]],
2398
+ "attrs": {"mxnet_version": ["int", 10300]}
2399
+ }
insightface/models/retinaface_r50_v1/R50-0000.params ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:20818d53adcefea4d3c4f31ba555910b9d052836588607af50af28cb414cb31e
3
+ size 118010124
insightface/models/retinaface_r50_v1/R50-symbol.json ADDED
The diff for this file is too large to render. See raw diff
 
insightface/models/scrfd_10g/model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:963570df5e0ebf6bb313239d0f9f3f0c096c1ff6937e8e28e45abad4d8b1d5c7
3
+ size 15545065
insightface/models/scrfd_10g_bnkps/model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d431436577d01c827abd78aa40c782b8fb318c26555ac60582144aaf66867411
3
+ size 17005828
insightface/models/scrfd_1g/model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c7d7d654c992c1581270461466a52c876234ad8be0ad8de37b9782d9f03beb86
3
+ size 2647067
insightface/models/scrfd_2.5g/model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fbe1d35ac6e0859307067bc3ccd44973b536b451437d23547fc460a05d00993f
3
+ size 2781443
insightface/models/scrfd_2.5g_bnkps/model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3db3b99c09e9212e9f2bb3970f6e641ec1812f27b19753f68289326067209662
3
+ size 3346972
insightface/models/scrfd_34g/model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6f69956639da31c96d8985c9a0ce1f5798f42cb64909159596e7a5f544ebe00
3
+ size 39677731
insightface/models/scrfd_500m/model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1043ab96cff67ee8ebb5fc2819f23f3620a128d133f5b5234cd2aedeeb83b5f0
3
+ size 2404021
insightface/models/scrfd_500m_bnkps/model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b04315df8db019067edacaceb73484e531981442f321432b8bf003e9812d6b3d
3
+ size 2669108
insightface/models/scrfd_person_2.5g.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:76522ba15eecb0712780509e912884aba066e9834be0c85761918cdcf76de5b5
3
+ size 3710223
insightface/models/synthetic_resnet50d.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:01b3d5533999da3e605e5b9d99fb0a2a55e634467346c7504e3fbf778cfb219e
3
+ size 190838028
talknet-asd/.dockerignore ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # The .dockerignore file excludes files from the container build process.
2
+ #
3
+ # https://docs.docker.com/engine/reference/builder/#dockerignore-file
4
+
5
+ # Cog
6
+ /demo/*
7
+
8
+ # Exclude Git files
9
+ .git
10
+ .github
11
+ .gitignore
12
+
13
+ # Exclude Python cache files
14
+ __pycache__
15
+ .mypy_cache
16
+ .pytest_cache
17
+ .ruff_cache
18
+
19
+ # Exclude Python virtual environment
20
+ /venv
talknet-asd/.gitignore ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Other files
2
+ *.model
3
+ *.pth
4
+ *.wav
5
+ *.mp4
6
+ *.txt
7
+ *.pcm
8
+ *.avi
9
+ data/
10
+ tests/
11
+ exps/
12
+ /demo/*
13
+ .cog
14
+
15
+ # Byte-compiled / optimized / DLL files
16
+ __pycache__/
17
+ *.py[cod]
18
+ *$py.class
19
+
20
+ # C extensions
21
+ *.so
22
+
23
+ # Distribution / packaging
24
+ .Python
25
+ build/
26
+ develop-eggs/
27
+ dist/
28
+ downloads/
29
+ eggs/
30
+ .eggs/
31
+ lib/
32
+ lib64/
33
+ parts/
34
+ sdist/
35
+ var/
36
+ wheels/
37
+ *.egg-info/
38
+ .installed.cfg
39
+ *.egg
40
+ MANIFEST
41
+
42
+ # PyInstaller
43
+ # Usually these files are written by a python script from a template
44
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
45
+ *.manifest
46
+ *.spec
47
+
48
+ # Installer logs
49
+ pip-log.txt
50
+ pip-delete-this-directory.txt
51
+
52
+ # Unit test / coverage reports
53
+ htmlcov/
54
+ .tox/
55
+ .coverage
56
+ .coverage.*
57
+ .cache
58
+ nosetests.xml
59
+ coverage.xml
60
+ *.cover
61
+ .hypothesis/
62
+ .pytest_cache/
63
+
64
+ # Translations
65
+ *.mo
66
+ *.pot
67
+
68
+ # Django stuff:
69
+ *.log
70
+ local_settings.py
71
+ db.sqlite3
72
+
73
+ # Flask stuff:
74
+ instance/
75
+ .webassets-cache
76
+
77
+ # Scrapy stuff:
78
+ .scrapy
79
+
80
+ # Sphinx documentation
81
+ docs/_build/
82
+
83
+ # PyBuilder
84
+ target/
85
+
86
+ # Jupyter Notebook
87
+ .ipynb_checkpoints
88
+
89
+ # pyenv
90
+ .python-version
91
+
92
+ # celery beat schedule file
93
+ celerybeat-schedule
94
+
95
+ # SageMath parsed files
96
+ *.sage.py
97
+
98
+ # Environments
99
+ .env
100
+ .venv
101
+ env/
102
+ venv/
103
+ ENV/
104
+ env.bak/
105
+ venv.bak/
106
+
107
+ # Spyder project settings
108
+ .spyderproject
109
+ .spyproject
110
+
111
+ # Rope project settings
112
+ .ropeproject
113
+
114
+ # mkdocs documentation
115
+ /site
116
+
117
+ # mypy
118
+ .mypy_cache/
talknet-asd/FAQ.md ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## 1. General Question
2
+
3
+ ### 1.1 Which dataset is used for training and testing ?
4
+ 'pretrain_AVA.model' is trained on the AVA training set and evaluate on the AVA val/test set, (Has the entire code)
5
+ 'pretrain_TalkSet.model' is trained on our TalkSet and evaluate on the Columbia ASD set or other raw videos.
6
+
7
+ ### 1.2 How to figure the variable length of data during training ?
8
+ We design a scheme to feed the variable-length data into the same mini-batch: we sort all videos by their length and put the videos with similar length into the same batch. We crop all videos into the minimum number of frames in this batch. In this way, we train the TalkNet with videos of different length without losing too much data.
9
+
10
+ ### 1.3 How to figure multiple faces on the screen ?
11
+ In the ASD task, when there are multiple face tracks in the video, we consider one track at a time. The face track of interest is given in each test trial. You can also consider the relationship between the faces on the screen at the same time. There are some papers about that.
12
+ ### 1.4 Error: RuntimeError: CUDA error: no kernel image is available for execution on the device
13
+ Do "pip install --pre torch torchvision -f https://download.pytorch.org/whl/nightly/cu111/torch_nightly.html -U", check this [page](https://github.com/pytorch/pytorch/issues/31285#issuecomment-739139454).
14
+
15
+ ### 1.5 Can not download csv, video data or pretrain model ?
16
+ I use google drive to upload the pretrain model and csv files. So you need to make sure you can use google drive under your internet. The error during extract video clips can be ignored.
17
+
18
+ ***
19
+
20
+ ## 2. TalkNet in AVA-Activespeaker dataset
21
+
22
+ ### 2.1 Can not reimplement the result ?
23
+ In our experiments, for the result in AVA validation set, for the same code/model, the best training result is 92.6mAP, the worst one is 92.1mAP. So it is reasonable if you get the result little different than this 92.3mAP. Also batchsize might effect the result (not too much).
24
+
25
+ ### 2.2 How to get the result in AVA test set ?
26
+ AVA test set did not release the labels. So you need to upload your csv result in their system. Notice that we delete add the first line in the `test_res.csv` file since we modify a bit for the `get_ava_active_speaker_performance.py`. You need to delete the first line when you upload it. For the upload file, you need to set all `label` as `SPEAKING_AUDIBLE`.
27
+
28
+ ### 2.3 What are the labels ? Where is SPEAKING_BUT_NOT_AUDIBLE ?
29
+ There are three labels: SPEAKING_AND_AUDIBLE, SPEAKING_BUT_NOT_AUDIBLE, NOT_SPEAKING, but in the finally evaluation, SPEAKING_BUT_NOT_AUDIBLE and NOT_SPEAKING share the same label. So this is a binary classification issue.
30
+
31
+ ### 2.4 How big your model ? How long for training?
32
+ Our model has 15.01M params, in one 22G GPU, each epoch we train 15 mins, evaluate in val set takes 5 mins. Train 25 epochs can get the best result. So at most 7 hours.
33
+
34
+ ***
35
+
36
+ ## 3. TalkNet in TalkSet and Columbia ASD dataset
37
+
38
+ ### 3.1 Why you build TalkSet instead of only use AVA dataset ?
39
+ Because we want our model can be used for all videos in the wild. AVA dataset has already provide the face bounding box for each trial, so for the videos not in AVA. If you want to do ASD, you need to do face detection first. In our experiments, the face detection method used in AVA is hard to reimplement. Which means we can hardly get the face area that similar to the detected face in AVA. Due to that, the model trained in AVA can not perform well in videos outside AVA if we use other face detection method.
40
+ Due to that, we build TalkSet, the face in TalkSet is all detected by S3FD. So for any raw video (Such as the videos in Col ASD dataset), we can use S3FD to do face detection first, then apply our TalkNet model to get the ASD label.
41
+
42
+ ### 3.2 TalkSet code can not work?
43
+ We did not verify this code. Because we just modify LRS3 and VoxCeleb2 to build this set, so we do not (or cannot) upload this set. This `generate_TalkSet.py` is what we used to generate this dataset, and we did not check it later. So it just used for your reference. We have already provide the data list, so you can generate this dataset based on it.
44
+
45
+ ***
46
+
47
+ ## 4. An ASD Demo with pretrained TalkNet model
48
+
49
+ ### 4.1 I try the demo, the performance is not so good ?
50
+ You can check the demo video `001.mp4` first and compare your output and our result `001_res.mp4` to make sure what you did is correct. So if you are not statisfied with the performance. We are sorry about that (~cry). We think this model can further improve. For the very short clips (less than 1s), small face and side face, the performance is not so good.
51
+
52
+ ### 4.2 I try the demo, the face can not be detected ?
53
+ That is the reason for the face detection model instead of the ASD part. You can use better face detecion model such as [Insightface](https://github.com/deepinsight/insightface
54
+ ). Only when the face can be detected, ASD model can perform to get the ASD labels.
talknet-asd/LICENSE.md ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2021 Tao Ruijie
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
talknet-asd/README.md ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Is someone talking? TalkNet: Audio-visual active speaker detection Model
2
+
3
+ This repository contains the code for our ACM MM 2021 paper (oral), TalkNet, an active speaker detection model to detect 'whether the face in the screen is speaking or not?'. [[Paper](https://arxiv.org/pdf/2107.06592.pdf)] [[Video_English](https://youtu.be/C6bpAgI9zxE)] [[Video_Chinese](https://www.bilibili.com/video/bv1Yw411d7HG)].
4
+
5
+ ### Updates:
6
+
7
+ A new [demo page](https://www.sievedata.com/functions/sieve/talknet-asd). Thanks the contribution from [mvoodarla](https://github.com/mvoodarla) !
8
+
9
+ ![overall.png](utils/overall.png)
10
+
11
+ - [**Awesome ASD**](https://github.com/TaoRuijie/TalkNet_ASD/blob/main/awesomeASD.md): Papers about active speaker detection in last years.
12
+
13
+ - **TalkNet in AVA-Activespeaker dataset**: The code to preprocess the AVA-ActiveSpeaker dataset, train TalkNet in AVA train set and evaluate it in AVA val/test set.
14
+
15
+ - **TalkNet in TalkSet and Columbia ASD dataset**: The code to generate TalkSet, an ASD dataset in the wild, based on VoxCeleb2 and LRS3, train TalkNet in TalkSet and evaluate it in Columnbia ASD dataset.
16
+
17
+ - **An ASD Demo with pretrained TalkNet model**: An end-to-end script to detect and mark the speaking face by the pretrained TalkNet model.
18
+
19
+ ***
20
+
21
+ ### Dependencies
22
+
23
+ Start from building the environment
24
+ ```
25
+ conda create -n TalkNet python=3.7.9 anaconda
26
+ conda activate TalkNet
27
+ pip install -r requirement.txt
28
+ ```
29
+
30
+ Start from the existing environment
31
+ ```
32
+ pip install -r requirement.txt
33
+ ```
34
+
35
+ ***
36
+
37
+ ## TalkNet in AVA-Activespeaker dataset
38
+
39
+ #### Data preparation
40
+
41
+ The following script can be used to download and prepare the AVA dataset for training.
42
+
43
+ ```
44
+ python trainTalkNet.py --dataPathAVA AVADataPath --download
45
+ ```
46
+
47
+ `AVADataPath` is the folder you want to save the AVA dataset and its preprocessing outputs, the details can be found in [here](https://github.com/TaoRuijie/TalkNet_ASD/blob/main/utils/tools.py#L34) . Please read them carefully.
48
+
49
+ #### Training
50
+ Then you can train TalkNet in AVA end-to-end by using:
51
+ ```
52
+ python trainTalkNet.py --dataPathAVA AVADataPath
53
+ ```
54
+ `exps/exps1/score.txt`: output score file, `exps/exp1/model/model_00xx.model`: trained model, `exps/exps1/val_res.csv`: prediction for val set.
55
+
56
+ #### Pretrained model
57
+ Our pretrained model performs `mAP: 92.3` in validation set, you can check it by using:
58
+ ```
59
+ python trainTalkNet.py --dataPathAVA AVADataPath --evaluation
60
+ ```
61
+ The pretrained model will automaticly be downloaded into `TalkNet_ASD/pretrain_AVA.model`. It performs `mAP: 90.8` in the testing set.
62
+
63
+ ***
64
+
65
+ ## TalkNet in TalkSet and Columbia ASD dataset
66
+
67
+ #### Data preparation
68
+
69
+ We find that it is challenge to apply the model we trained in AVA for the videos not in AVA (Reason is [here](https://github.com/TaoRuijie/TalkNet_ASD/blob/main/FAQ.md), Q3.1). So we build TalkSet, an active speaker detection dataset in the wild, based on `VoxCeleb2` and `LRS3`.
70
+
71
+ We do not plan to upload this dataset since we just modify it, instead of building it. In `TalkSet` folder we provide these `.txt` files to describe which files we used to generate the TalkSet and their ASD labels. You can generate this `TalkSet` if you are interested to train an ASD model in the wild.
72
+
73
+ Also, we have provided our pretrained TalkNet model in TalkSet. You can evaluate it in Columbia ASD dataset or other raw videos in the wild.
74
+
75
+ #### Usage
76
+
77
+ A pretrain model in TalkSet will be download into `TalkNet_ASD/pretrain_TalkSet.model` when using the following script:
78
+
79
+ ```
80
+ python demoTalkNet.py --evalCol --colSavePath colDataPath
81
+ ```
82
+
83
+ Also, Columnbia ASD dataset and the labels will be downloaded into `colDataPath`. Finally you can get the following F1 result.
84
+
85
+ | Name | Bell | Boll | Lieb | Long | Sick | Avg. |
86
+ |----- | ------ | ------ | ------ | ------ | ------ | ------ |
87
+ | F1 | 98.1 | 88.8 | 98.7 | 98.0 | 97.7 | 96.3 |
88
+
89
+ (This result is different from that in our paper because we train the model again, while the avg. F1 is very similar)
90
+ ***
91
+
92
+ ## An ASD Demo with pretrained TalkNet model
93
+
94
+ #### Data preparation
95
+
96
+ We build an end-to-end script to detect and extract the active speaker from the raw video by our pretrain model in TalkSet.
97
+
98
+ You can put the raw video (`.mp4` and `.avi` are both fine) into the `demo` folder, such as `001.mp4`.
99
+
100
+ #### Usage
101
+
102
+ ```
103
+ python demoTalkNet.py --videoName 001
104
+ ```
105
+
106
+ A pretrain model in TalkSet will be downloaded into `TalkNet_ASD/pretrain_TalkSet.model`. The structure of the output reults can be found in [here](https://github.com/TaoRuijie/TalkNet_ASD/blob/main/demoTalkNet.py#L351).
107
+
108
+ You can get the output video `demo/001/pyavi/video_out.avi`, which has marked the active speaker by green box and non-active speaker by red box.
109
+
110
+ If you want to evaluate by using cpu only, you can modify `demoTalkNet.py` and `talkNet.py` file: modify all `cuda` into `cpu`. Then replace line 83 in talkNet.py into `loadedState = torch.load(path,map_location=torch.device('cpu'))`
111
+
112
+ ***
113
+
114
+ ### Citation
115
+
116
+ Please cite the following if our paper or code is helpful to your research.
117
+ ```
118
+ @inproceedings{tao2021someone,
119
+ title={Is Someone Speaking? Exploring Long-term Temporal Features for Audio-visual Active Speaker Detection},
120
+ author={Tao, Ruijie and Pan, Zexu and Das, Rohan Kumar and Qian, Xinyuan and Shou, Mike Zheng and Li, Haizhou},
121
+ booktitle = {Proceedings of the 29th ACM International Conference on Multimedia},
122
+ pages = {3927–3935},
123
+ year={2021}
124
+ }
125
+ ```
126
+ I have summaried some potential [FAQs](https://github.com/TaoRuijie/TalkNet_ASD/blob/main/FAQ.md). You can also check the `issues` in Github for other questions that I have answered.
127
+
128
+ This is my first open-source work, please let me know if I can future improve in this repositories or there is anything wrong in our work. Thanks for your support!
129
+
130
+ ### Acknowledge
131
+
132
+ We study many useful projects in our codeing process, which includes:
133
+
134
+ The structure of the project layout and the audio encoder is learnt from this [repository](https://github.com/clovaai/voxceleb_trainer).
135
+
136
+ Demo for visulization is modified from this [repository](https://github.com/joonson/syncnet_python).
137
+
138
+ AVA data download code is learnt from this [repository](https://github.com/fuankarion/active-speakers-context).
139
+
140
+ The model for the visual frontend is learnt from this [repository](https://github.com/lordmartian/deep_avsr).
141
+
142
+ Thanks for these authors to open source their code!
143
+
144
+ ### Cooperation
145
+
146
+ If you are interested to work on this topic and have some ideas to implement, I am glad to collaborate and contribute with my experiences & knowlegde in this topic. Please contact me with ruijie.tao@u.nus.edu.
talknet-asd/TalkSet/README.md ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ### TalkSet Generation
2
+
3
+ You can check the 'train.txt' and 'test.txt' to generate TalkSet by your own.
4
+
5
+ This `generate_TalkSet.py` code is just used for your reference, I did not check it recently.
6
+
7
+ Input the LRS3, VoxCeleb2, 3 list files in `lists_in`
8
+ Output TalkSet, train.txt, test.txt (Here the test set is the validation set actually)
9
+
10
+ ### Usage:
11
+
12
+ Set the following parser based on the location of your data:
13
+
14
+ `out_path`: the output TalkSet location
15
+ `Vox_audio`: Location of the Vox2, training set, audio location
16
+ `Vox_video`: Location of the Vox2, training set, video location
17
+ `lrs3_audio`: Location of the LRS3, audio location
18
+ `lrs3_video`: Location of the LRS3, video location
19
+ `task`: The part of the TalkSet you want to generate, eg: TAudio
20
+ `num_cpu`: The num of the threads, higher will be faster, based on your PC performance, eg: 10
21
+
22
+ ```
23
+ python TalkSet/generate_TalkSet.py --task 'TAudio'
24
+ python TalkSet/generate_TalkSet.py --task 'FAudio'
25
+ python TalkSet/generate_TalkSet.py --task 'TFAudio'
26
+ python TalkSet/generate_TalkSet.py --task 'TSilence'
27
+ python TalkSet/generate_TalkSet.py --task 'FSilence'
28
+ python TalkSet/generate_TalkSet.py --task 'Fusion'
29
+ ```
30
+
31
+ ### Description:
32
+ For `lists_out\*.txt` files:
33
+ - The 1st row is the face clips data type,
34
+ - TAudio: audio is active, lip is moving, audio and lip are sync
35
+ - FAudio: audio is active, lip is moving, audio and lip are not sync (Speech from others)
36
+ - TFAudio: one part is 'TAudio', the other part is 'FAudio'
37
+ - TSilence: one part is 'TAudio', in the other part, audio is non-active, lip is not moving
38
+ - FSilence: one part is 'silence'(audio is non-active, lip is not moving), in the other part, audio is active, lip is not moving (Speech from others)
39
+ - The 2nd row is the path for the audio file (filename started from 'silence' is the data from LRS3, filename started from 'id.....' is the data from VoxCeleb2)
40
+ - The 3rd row is the path for the video file
41
+ - The 4th row is the length(seconds) of this data
42
+ - The 5th row is the start of 'active' clip (in FSilence, it presents the 'silence' part)
43
+ - The 6th row is the end of 'active' clip
44
+ - The 7th row is the start of 'non-active' clip (in FSilence, it presents the 'speech from others' part)
45
+ - The 8th row is the end of 'non-active' clip
46
+ - The 9th row is the file ID
47
+
48
+ The dataset generated will not be fixed each time because we randomly select FSlience data, and the change point is the random number. We believe the result will be similar. The whole time to generate the TalkSet will use about 3 to 6 hours in our experiments.
talknet-asd/TalkSet/generate_TalkSet.py ADDED
@@ -0,0 +1,391 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, glob, subprocess, argparse, sys, numpy, random, math, cv2
2
+ from itertools import repeat
3
+ from multiprocessing import Pool
4
+ from scipy.io import wavfile
5
+ from pydub import AudioSegment
6
+ from tqdm import tqdm
7
+
8
+ def get_length(input_video):
9
+ result = subprocess.run(['ffprobe', '-v', 'error', '-show_entries', 'format=duration', '-of', 'default=noprint_wrappers=1:nokey=1', input_video], stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
10
+ return float(result.stdout)
11
+
12
+ def read_Vox_lines(file):
13
+ Tlines, Flines = [], []
14
+ with open(file) as f_in:
15
+ while True:
16
+ line = f_in.readline()
17
+ if not line:
18
+ break
19
+ if int(line[0]):
20
+ Tlines.append(line)
21
+ else:
22
+ Flines.append(line)
23
+ return Tlines, Flines
24
+
25
+ def read_LRS3_ST(file):
26
+ lines = []
27
+ with open(file) as f_in:
28
+ while True:
29
+ line = f_in.readline()
30
+ if not line:
31
+ break
32
+ lines.append(line)
33
+ return lines[:30000]
34
+
35
+ def read_LRS3_S(file):
36
+ lines = []
37
+ with open(file) as f_in:
38
+ while True:
39
+ line = f_in.readline()
40
+ if not line:
41
+ break
42
+ start = int(line.split()[1]) / 100
43
+ end = int(line.split()[2]) / 100
44
+ if end - start <= 3: # Only select less than 3s
45
+ lines.append(line)
46
+ return lines[:30000]
47
+
48
+ def generate_TAudio(line, args):
49
+ # Get the id of the audio and video
50
+ audio_name = line.split()[1][:-4]
51
+ video_name = line.split()[2][:-4]
52
+ id1 = audio_name.split('/')[0]
53
+ name1 = audio_name.split('/')[0] + '_' + audio_name.split('/')[1] + '_' + audio_name.split('/')[2]
54
+ name2 = video_name.split('/')[0] + '_' + video_name.split('/')[1] + '_' + video_name.split('/')[2]
55
+ name = name1 + '_' + name2
56
+ audio_path = os.path.join(args.Vox_audio, audio_name + '.wav')
57
+ video_path = os.path.join(args.Vox_video, video_name + '.mp4')
58
+ out_audio_path = os.path.join(args.out_path, 'TAudio', id1 + '/' + name + '.wav')
59
+ out_video_path = os.path.join(args.out_path, 'TAudio', id1 + '/' + name + '.mp4')
60
+ os.makedirs(os.path.join(args.out_path, 'TAudio', id1), exist_ok = True)
61
+
62
+ # Read the audio data and the length of audio and video
63
+ audio = AudioSegment.from_file(audio_path, format="wav")
64
+ length_audio = len(audio) / 1000.0
65
+ length_video = get_length(video_path)
66
+ length_data = int(min(length_video, length_audio) * 100) / 100
67
+ audio = audio[:int(length_data * 1000)]
68
+
69
+ # Extract the video and audio
70
+ start = 0
71
+ end = length_data
72
+ audio.export(out_audio_path, format="wav")
73
+ cmd = "ffmpeg -y -ss %.3f -t %.3f -i %s -i %s -c:v copy -c:a aac -map 0:v:0 -map 1:a:0 -shortest %s -loglevel panic"% (start, end - start, video_path, out_audio_path, out_video_path)
74
+ subprocess.call(cmd, shell=True, stdout=None)
75
+
76
+ # # Write the txt file
77
+ start_T, end_T = 0, length_data
78
+ start_F, end_F= 0, 0
79
+ line_new = "TAudio" + ' ' + str(audio_name) + ' ' + str(video_name) + ' ' + str(length_data) \
80
+ + ' ' + str(start_T) + ' ' + str(end_T) + ' ' + str(start_F) + ' ' + str(end_F) + '\n'
81
+ return line_new
82
+
83
+ def generate_FAudio(line, args):
84
+ # Get the id of the audio and video
85
+ audio_name = line.split()[1][:-4]
86
+ video_name = line.split()[2][:-4]
87
+ id1 = audio_name.split('/')[0]
88
+ name1 = audio_name.split('/')[0] + '_' + audio_name.split('/')[1] + '_' + audio_name.split('/')[2]
89
+ name2 = video_name.split('/')[0] + '_' + video_name.split('/')[1] + '_' + video_name.split('/')[2]
90
+ name = name1 + '_' + name2
91
+ audio_path = os.path.join(args.Vox_audio, audio_name + '.wav')
92
+ video_path = os.path.join(args.Vox_video, video_name + '.mp4')
93
+ out_audio_path = os.path.join(args.out_path, 'FAudio', id1 + '/' + name + '.wav')
94
+ out_video_path = os.path.join(args.out_path, 'FAudio', id1 + '/' + name + '.mp4')
95
+ os.makedirs(os.path.join(args.out_path, 'FAudio', id1), exist_ok = True)
96
+
97
+ # Read the audio data and the length of audio and video
98
+ audio = AudioSegment.from_file(audio_path, format="wav")
99
+ length_audio = len(audio) / 1000.0
100
+ length_video = get_length(video_path)
101
+ length_data = int(min(length_video, length_audio) * 100) / 100
102
+ audio = audio[:int(length_data * 1000)]
103
+
104
+ # Extract the video and audio
105
+ start = 0
106
+ end = length_data
107
+ audio.export(out_audio_path, format="wav")
108
+ cmd = "ffmpeg -y -ss %.3f -t %.3f -i %s -i %s -c:v copy -c:a aac -map 0:v:0 -map 1:a:0 -shortest %s -loglevel panic"% (start, end - start, video_path, out_audio_path, out_video_path)
109
+ subprocess.call(cmd, shell=True, stdout=None)
110
+
111
+ # Write the txt file
112
+ start_T, end_T = 0, 0
113
+ start_F, end_F= 0, length_data
114
+ line_new = "FAudio" + ' ' + str(audio_name) + ' ' + str(video_name) + ' ' + str(length_data) \
115
+ + ' ' + str(start_T) + ' ' + str(end_T) + ' ' + str(start_F) + ' ' + str(end_F) + '\n'
116
+ return line_new
117
+
118
+ def generate_TFAudio(line, args):
119
+ # Get the id of the audio and video
120
+ audio_name = line.split()[1][:-4]
121
+ video_name = line.split()[2][:-4]
122
+ id1 = audio_name.split('/')[0]
123
+ name1 = audio_name.split('/')[0] + '_' + audio_name.split('/')[1] + '_' + audio_name.split('/')[2]
124
+ name2 = video_name.split('/')[0] + '_' + video_name.split('/')[1] + '_' + video_name.split('/')[2]
125
+ name = name1 + '_' + name2
126
+ audio_T_path = os.path.join(args.Vox_audio, video_name + '.wav')
127
+ audio_F_path = os.path.join(args.Vox_audio, audio_name + '.wav')
128
+ video_path = os.path.join(args.Vox_video, video_name + '.mp4')
129
+ out_audio_path = os.path.join(args.out_path, 'TFAudio', id1 + '/' + name + '.wav')
130
+ out_video_path = os.path.join(args.out_path, 'TFAudio', id1 + '/' + name + '.mp4')
131
+ os.makedirs(os.path.join(args.out_path, 'TFAudio', id1), exist_ok = True)
132
+
133
+ # Read the audio data and the length of audio and video
134
+ audio_T = AudioSegment.from_file(audio_T_path, format="wav")
135
+ audio_F = AudioSegment.from_file(audio_F_path, format="wav")
136
+ length_audio_T = len(audio_T) / 1000.0
137
+ length_audio_F = len(audio_F) / 1000.0
138
+ length_video = get_length(video_path)
139
+ length_data = int(min(length_audio_T, length_audio_F, length_video) * 100) / 100
140
+ audio_T = audio_T[:int(length_data * 1000)]
141
+ audio_F = audio_F[:int(length_data * 1000)]
142
+
143
+ # Generate the audio
144
+ changepoint = int((length_data * 0.25 + length_data * random.random() * 0.5) * 100) / 100
145
+ audio_dict = {}
146
+ audio_dict['T1'] = audio_T[:changepoint * 1000]
147
+ audio_dict['T2'] = audio_T[changepoint * 1000:]
148
+ audio_dict['F1'] = audio_F[:changepoint * 1000]
149
+ audio_dict['F2'] = audio_F[changepoint * 1000:]
150
+ seed = random.randint(0,1)
151
+ if seed == 1:
152
+ audio = audio_dict['T1'] + audio_dict['F2']
153
+ else:
154
+ audio = audio_dict['F1'] + audio_dict['T2']
155
+ # Extract the video and audio
156
+ start = 0
157
+ end = length_data
158
+ audio.export(out_audio_path, format="wav")
159
+ cmd = "ffmpeg -y -ss %.3f -t %.3f -i %s -i %s -c:v copy -c:a aac -map 0:v:0 -map 1:a:0 -shortest %s -loglevel panic"% (start, end - start, video_path, out_audio_path, out_video_path)
160
+ subprocess.call(cmd, shell=True, stdout=None)
161
+
162
+ # Write the txt file
163
+ if seed == 1:
164
+ start_T, end_T, start_F, end_F = 0, changepoint, changepoint, length_data
165
+ elif seed == 0:
166
+ start_F, end_F, start_T, end_T = 0, changepoint, changepoint, length_data
167
+ line_new = "TFAudio" + ' ' + str(audio_name) + ' ' + str(video_name) + ' ' + str(length_data) \
168
+ + ' ' + str(start_T) + ' ' + str(end_T) + ' ' + str(start_F) + ' ' + str(end_F) + '\n'
169
+ return line_new
170
+
171
def generate_TSilence(line, args):
    """Build one 'TSilence' clip (speech/silence switch over a talking video).

    `line` is one LRS3_ST list entry:
        <type_change> <clip_id> <start*100> <mid*100> <end*100> <suffix>
    where type_change "10" puts the talking span first and "01" puts it after
    the change point.  Times are in hundredths of seconds.
    Writes the spliced wav and the remuxed mp4, and returns the list-file line
    describing the generated clip.
    """
    # Get the id of the audio and video
    type_change = line.split()[0]
    audio_name = line.split()[1]
    video_name = line.split()[1]
    id1 = audio_name.split('/')[0]
    # Output basename: <spk>_<clip>_<suffix> for both the audio and video ids.
    name1 = audio_name.split('/')[0] + '_' + audio_name.split('/')[1] + '_' + line.split()[5]
    name2 = video_name.split('/')[0] + '_' + video_name.split('/')[1] + '_' + line.split()[5]
    name = name1 + '_' + name2
    # Convert the hundredths-of-seconds list fields to seconds.
    start = int(line.split()[2]) / 100
    mid = int(line.split()[3]) / 100
    end = int(line.split()[4]) / 100
    # NOTE(review): audio_name[8:] strips a fixed 8-char prefix from the list
    # entry — confirm this matches the LRS3_ST list path format.
    audio_path = os.path.join(args.lrs3_audio, 'pretrain', audio_name[8:] + '.wav')
    video_path = os.path.join(args.lrs3_video, 'pretrain', video_name[8:]+ '.mp4')
    out_audio_path = os.path.join(args.out_path, 'TSilence', id1 + '/' + name + '.wav')
    out_video_path = os.path.join(args.out_path, 'TSilence', id1 + '/' + name + '.mp4')
    os.makedirs(os.path.join(os.path.join(args.out_path, 'TSilence'), id1), exist_ok = True)

    # Read the audio data and the length of audio and video
    audio = AudioSegment.from_file(audio_path, format="wav")

    # Get the required audio and video data (length truncated to 1/100 s).
    length_data = int((end - start) * 100) / 100
    audio = audio[int(start * 1000):int(end * 1000)]  # pydub slices in milliseconds
    audio.export(out_audio_path, format="wav")
    # Mux the [start, end) slice of the original video with the exported audio.
    cmd = "ffmpeg -y -ss %.3f -t %.3f -i %s -i %s -c:v copy -c:a aac -map 0:v:0 -map 1:a:0 -shortest %s -loglevel panic"% (start, end - start, video_path, out_audio_path, out_video_path)
    subprocess.call(cmd, shell=True, stdout=None)

    # Write the txt file: [start_T, end_T] is the talking span, [start_F, end_F]
    # the non-talking span, both relative to the generated clip.
    changepoint = int((mid - start) * 100) / 100
    if type_change == "10":
        start_T, end_T, start_F, end_F = 0, changepoint, changepoint, length_data
    elif type_change == "01":
        start_T, end_T, start_F, end_F = changepoint, length_data, 0, changepoint
    # NOTE(review): any other type_change value leaves start_T..end_F undefined
    # and raises NameError below — confirm the list only contains "10"/"01".

    # Replace the last 5 characters of the ids with the list suffix.
    audio_name = audio_name[:-5] + line.split()[5]
    video_name = video_name[:-5] + line.split()[5]
    line_new = "TSilence" + ' ' + str(audio_name) + ' ' + str(video_name) + ' ' + str(length_data) \
        + ' ' + str(start_T) + ' ' + str(end_T) + ' ' + str(start_F) + ' ' + str(end_F) + '\n'
    return line_new
210
+
211
def generate_FSilence(line, Flines, args):
    """Build one 'FSilence' clip and return its list-file line.

    `line` is an LRS3_S list entry: <clip_id> <start*100> <end*100> ... <suffix>.
    The clip's own audio (audio_T) and a randomly drawn VoxCeleb utterance
    (audio_F, from `Flines`) are spliced at a random change point; the original
    video track is kept unchanged.
    """
    # Get the id of the audio and video
    audio_T_name = line.split()[0]
    video_name = line.split()[0]
    start = int(line.split()[1]) / 100
    end = int(line.split()[2]) / 100
    length_data = int((end - start) * 100) / 100
    # Change point uniformly inside the middle half [0.25, 0.75] of the clip.
    changepoint = int((length_data * 0.25 + length_data * random.random() * 0.5) * 100) / 100
    # Rejection-sample a VoxCeleb line whose utterance is at least clip-length.
    speech_line = random.choice(Flines)
    length_speech = float(speech_line.split()[-1])
    while length_speech < length_data:
        speech_line = random.choice(Flines)
        length_speech = float(speech_line.split()[-1])
    audio_F_name = speech_line.split()[1][:-4]  # strip the 4-char extension
    id1 = audio_F_name.split('/')[0]
    name1 = audio_F_name.split('/')[0] + '_' + audio_F_name.split('/')[1] + '_' + audio_F_name.split('/')[2]
    name2 = audio_T_name.split('/')[0] + '_' + audio_T_name.split('/')[1] + '_' + line.split()[-1]
    name = name1 + '_' + name2

    # True: orig_video  False: speech+silence (original author's note)
    video_path = os.path.join(args.lrs3_video, 'pretrain', video_name[8:]+ '.mp4')
    audio_T_path = os.path.join(args.lrs3_audio, 'pretrain', audio_T_name[8:] + '.wav')
    audio_F_path = os.path.join(args.Vox_audio, audio_F_name + '.wav')
    out_audio_path = os.path.join(args.out_path, 'FSilence', id1 + '/' + name + '.wav')
    out_video_path = os.path.join(args.out_path, 'FSilence', id1 + '/' + name + '.mp4')
    os.makedirs(os.path.join(args.out_path, 'FSilence', id1), exist_ok = True)

    # Read the audio data and the length of audio and video
    audio_T = AudioSegment.from_file(audio_T_path, format="wav")
    audio_T = audio_T[int(start * 1000):int(end * 1000)]  # pydub slices in ms
    audio_F = AudioSegment.from_file(audio_F_path, format="wav")
    length_audio_T = len(audio_T) / 1000.0
    length_audio_F = len(audio_F) / 1000.0
    # get_length is a sibling helper defined earlier in this file; presumably
    # the video duration in seconds — TODO confirm.
    length_video = get_length(video_path)
    # Final clip length = shortest of the three streams, truncated to 1/100 s.
    # NOTE(review): `changepoint` was computed from the pre-clip length above
    # and can exceed this recomputed length_data, making one spliced half
    # empty — confirm the input lists guarantee this cannot happen.
    length_data = int(min(length_audio_T, length_audio_F, length_video) * 100) / 100
    audio_T = audio_T[:int(length_data * 1000)]
    audio_F = audio_F[:int(length_data * 1000)]

    # Generate the audio: split both streams at the change point (milliseconds).
    audio_dict = {}
    audio_dict['T1'] = audio_T[:changepoint * 1000]
    audio_dict['T2'] = audio_T[changepoint * 1000:]
    audio_dict['F1'] = audio_F[:changepoint * 1000]
    audio_dict['F2'] = audio_F[changepoint * 1000:]
    # Coin flip: own audio first (T1+F2) or foreign speech first (F1+T2).
    seed = random.randint(0,1)
    if seed == 1:
        audio = audio_dict['T1'] + audio_dict['F2']
    else:
        audio = audio_dict['F1'] + audio_dict['T2']
    # Extract the video and audio
    audio.export(out_audio_path, format="wav")
    cmd = "ffmpeg -y -ss %.3f -t %.3f -i %s -i %s -c:v copy -c:a aac -map 0:v:0 -map 1:a:0 -shortest %s -loglevel panic"% (start, end - start, video_path, out_audio_path, out_video_path)
    subprocess.call(cmd, shell=True, stdout=None)

    # Write the txt file: [start_T, end_T] / [start_F, end_F] spans relative to
    # the generated clip (T = own audio, F = foreign speech).
    if seed == 1:
        start_T, end_T, start_F, end_F = 0, changepoint, changepoint, length_data
    elif seed == 0:
        start_F, end_F, start_T, end_T = 0, changepoint, changepoint, length_data

    video_name = video_name[:-5] + line.split()[-1]
    line_new = "FSilence" + ' ' + str(audio_F_name) + ' ' + str(video_name) + ' ' + str(length_data) \
        + ' ' + str(start_T) + ' ' + str(end_T) + ' ' + str(start_F) + ' ' + str(end_F) + '\n'
    return line_new
275
+
276
# MAIN
# Command-line driver: each --task value generates one category of TalkSet
# clips (TAudio / FAudio / TFAudio / TSilence / FSilence) plus its list file;
# 'Fusion' merges the five list files into shuffled train/test splits.
parser = argparse.ArgumentParser(description = "generate_Dataset")

parser.add_argument('--List_folder', type=str, default= 'lists')
parser.add_argument('--out_path', type=str, default= '/data07/ruijie/database/TalkSet_final')
parser.add_argument('--Vox_audio', type=str, default= '/home/ruijie/database/VoxCeleb2/audio/audio_clean/clean/train')
parser.add_argument('--Vox_video', type=str, default= '/home/ruijie/database/VoxCeleb2/video/orig/train')
parser.add_argument('--lrs3_audio', type=str, default='/data07/ruijie/database/LRS3/audio/orig_audio/clean')
parser.add_argument('--lrs3_video', type=str, default='/data07/ruijie/database/LRS3/video/orig_video')
parser.add_argument('--task', type=str, default='TAudio')
parser.add_argument('--num_cpu', type=int, default=10)
args = parser.parse_args()

# One output folder per clip category.
for category in ('TAudio', 'FAudio', 'TFAudio', 'FSilence', 'TSilence'):
    os.makedirs(os.path.join(args.out_path, category), exist_ok = True)

# Input list files and output list locations.
args.list_Vox = os.path.join(args.List_folder, 'lists_in', 'Vox_list.txt')
args.list_LRS3_S = os.path.join(args.List_folder, 'lists_in', 'LRS3_S_list.txt')
args.list_LRS3_ST = os.path.join(args.List_folder, 'lists_in', 'LRS3_ST_list.txt')
args.list_out = os.path.join(args.List_folder, 'lists_out')
args.list_out_train = os.path.join(args.list_out, 'train.txt')
args.list_out_test = os.path.join(args.list_out, 'test.txt')

if args.task == 'TAudio':
    Tlines, _ = read_Vox_lines(args.list_Vox)
    # Generate the clips in parallel; starmap preserves the input order.
    with Pool(args.num_cpu) as p:
        Tlines_new = p.starmap(generate_TAudio, zip(Tlines, repeat(args)))
    # 'with' flushes and closes the list file (the original never closed it).
    with open(os.path.join(args.list_out, 'TAudio.txt'), "w") as out_Tlist_file:
        out_Tlist_file.writelines(Tlines_new)
    print('TAudio Finish')

if args.task == 'FAudio':
    _, Flines = read_Vox_lines(args.list_Vox)
    with Pool(args.num_cpu) as p:
        Flines_new = p.starmap(generate_FAudio, zip(Flines, repeat(args)))
    with open(os.path.join(args.list_out, 'FAudio.txt'), "w") as out_Flist_file:
        out_Flist_file.writelines(Flines_new)
    print('FAudio Finish')

if args.task == 'TFAudio':
    _, Flines = read_Vox_lines(args.list_Vox)
    with Pool(args.num_cpu) as p:
        TFlines_new = p.starmap(generate_TFAudio, zip(Flines, repeat(args)))
    with open(os.path.join(args.list_out, 'TFAudio.txt'), "w") as out_TFlist_file:
        out_TFlist_file.writelines(TFlines_new)
    print('TFAudio Finish')

if args.task == 'TSilence':
    Slines = read_LRS3_ST(args.list_LRS3_ST)
    with Pool(args.num_cpu) as p:
        TSlines_new = p.starmap(generate_TSilence, zip(Slines, repeat(args)))
    with open(os.path.join(args.list_out, 'TSilence.txt'), "w") as out_TSlist_file:
        out_TSlist_file.writelines(TSlines_new)
    print('TSilence Finish')

if args.task == 'FSilence':
    Tlines, _ = read_Vox_lines(args.list_Vox)
    Slines = read_LRS3_S(args.list_LRS3_S)
    with Pool(args.num_cpu) as p:
        FSlines_new = p.starmap(generate_FSilence, zip(Slines, repeat(Tlines), repeat(args)))
    with open(os.path.join(args.list_out, 'FSilence.txt'), "w") as out_FSlist_file:
        out_FSlist_file.writelines(FSlines_new)
    print('FSilence Finish')

if args.task == 'Fusion':
    # Concatenate the five per-category list files (order is irrelevant:
    # everything is shuffled below), then split 90% train / 10% test.
    lines = []
    for name in ('TAudio', 'FAudio', 'TFAudio', 'TSilence', 'FSilence'):
        with open(args.list_out + '/' + name + '.txt') as f:
            lines.extend(f.readlines())
    random.shuffle(lines)
    with open(args.list_out_train, "w") as train_file, open(args.list_out_test, "w") as test_file:
        for num, line in enumerate(lines):
            data = line.split()
            # For the data longer than 6s, we cut them into 6s in the list,
            # so that will make the training process simple.
            if float(data[3]) > 6:
                fields = data[0:3] + [str(min(float(value), 6)) for value in data[3:8]]
            else:
                fields = data[0:8]
            line = ' '.join(fields) + ' ' + "%06d" % int(num) + '\n'
            # 27000 of every 30000 shuffled entries go to train (90/10 split).
            if num % 30000 < 27000:
                train_file.write(line)
            else:
                test_file.write(line)
talknet-asd/awesomeASD.md ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Related Work for Active Speaker Detection
2
+
3
+ ---
4
+ ### Research Paper In **AVA-ActiveSpeaker Dataset**
5
+
6
+ - Roth J, Chaudhuri S, Klejch O, et al. Ava active speaker: [An audio-visual dataset for active speaker detection](https://arxiv.org/pdf/1901.01342.pdf), ICASSP, 2020.
7
+ - Sharma R, Somandepalli K, Narayanan S. [Crossmodal learning for audio-visual speech event localization](https://arxiv.org/pdf/2003.04358.pdf), arXiv preprint, 2020.
8
+ - Alcázar J L, Caba F, Mai L, et al. [Active speakers in context](https://openaccess.thecvf.com/content_CVPR_2020/papers/Alcazar_Active_Speakers_in_Context_CVPR_2020_paper.pdf) , CVPR, 2020.
9
+ - León-Alcázar J, Heilbron F C, Thabet A, et al. [MAAS: Multi-modal Assignation for Active Speaker Detection](https://arxiv.org/pdf/2101.03682.pdf), arXiv preprint, 2021.
10
+ - Huang C, Koishida K. [Improved Active Speaker Detection based on Optical Flow](https://openaccess.thecvf.com/content_CVPRW_2020/papers/w56/Huang_Improved_Active_Speaker_Detection_Based_on_Optical_Flow_CVPRW_2020_paper.pdf), CVPR Workshops, 2020
11
+ - Assunção G, Gonçalves N, Menezes P. [Bio-Inspired Modality Fusion for Active Speaker Detection](https://www.mdpi.com/2076-3417/11/8/3397/pdf), Applied Sciences, 2021
12
+ - Pouthier B, Pilati L, Gudupudi L K, et al. [Active Speaker Detection as a Multi-Objective Optimization with Uncertainty-based Multimodal Fusion](https://arxiv.org/pdf/2106.03821.pdf), arXiv preprint, 2021
13
+ - Köpüklü O, Taseska M, Rigoll G. [How to Design a Three-Stage Architecture for Audio-Visual Active Speaker Detection in the Wild](https://arxiv.org/pdf/2106.03932.pdf), arXiv preprint, 2021
14
+ - Ruijie Tao, Zexu Pan, Rohan Kumar Das, Xinyuan Qian, Mike Zheng Shou, Haizhou Li. [Is Someone Speaking? Exploring Long-term Temporal Features for Audio-visual Active Speaker Detection](https://arxiv.org/pdf/2107.06592.pdf), ACM Multimedia (MM), 2021
15
+ - Yuanhang Zhang, Susan Liang, Shuang Yang, Xiao Liu, Zhongqin Wu, Shiguang Shan, Xilin Chen. [UniCon: Unified Context Network for Robust Active Speaker
16
+ Detection](https://arxiv.org/pdf/2108.02607.pdf), ACM Multimedia (MM), 2021
17
+
18
+
19
+ ### Research Report In **AVA-ActiveSpeaker Dataset for AVA-Activity Challenge**
20
+ - Chung J S. [Naver at ActivityNet Challenge 2019--Task B Active Speaker Detection (AVA)](https://arxiv.org/pdf/1906.10555.pdf), 2019.
21
+ - Zhang Y H, Xiao J, Yang S, et al. [Multi-Task Learning for Audio-Visual Active Speaker Detection](https://static.googleusercontent.com/media/research.google.com/zh-CN//ava/2019/Multi_Task_Learning_for_Audio_Visual_Active_Speaker_Detection.pdf), 2019
22
+ - Alcázar J L, Caba F, Mai L, et al. [Universidad de los Andes at ActivityNet Challenge 2020 - Task B Active Speaker
23
+ Detection (AVA)](https://static.googleusercontent.com/media/research.google.com/zh-CN//ava/2020/ASC_AN_report.pdf), 2020
24
+ - Köpüklü O, Taseska M, Rigoll G. [ASDNet at ActivityNet Challenge 2021-Active Speaker Detection (AVA)](https://static.googleusercontent.com/media/research.google.com/zh-CN//ava/2021/S2_ActivityNet_Report_ASDNet.pdf), 2021
25
+ - Zhang Y, Liang S, Yang S, et al. [ICTCAS-UCAS-TAL Submission to the AVA-ActiveSpeaker Task at ActivityNet Challenge 2021](http://static.googleusercontent.com/media/research.google.com/zh-CN//ava/2021/S1_ICTCAS-UCAS-TAL.pdf), 2021
26
+ - Tao R, Pan Z, Das R K, et al. [NUS-HLT Report for ActivityNet Challenge 2021 AVA (Speaker)](https://static.googleusercontent.com/media/research.google.com/zh-CN//ava/2021/S3_NUS_Report_AVA_ActiveSpeaker_2021.pdf), 2021
27
+
28
+ ### Research Paper In **Columbia Active Speaker Detection Dataset**
29
+ - Chakravarty P, Tuytelaars T. [Cross-modal supervision for learning active speaker detection in video](https://arxiv.org/pdf/1603.08907.pdf), ECCV, 2016
30
+ - Chung J S, Zisserman A. [Out of time: automated lip sync in the wild](https://www.robots.ox.ac.uk/~vgg/publications/2016/Chung16a/chung16a.pdf), ECCV, 2016
31
+ - Shahid M, Beyan C, Murino V. [Voice activity detection by upper body motion analysis and unsupervised domain adaptation](https://openaccess.thecvf.com/content_ICCVW_2019/papers/HBU/Shahid_Voice_Activity_Detection_by_Upper_Body_Motion_Analysis_and_Unsupervised_ICCVW_2019_paper.pdf), ICCV Workshops, 2019
32
+ - Afouras T, Owens A, Chung J S, et al. [Self-supervised learning of audio-visual objects from video](https://arxiv.org/pdf/2008.04237.pdf), ECCV, 2020
33
+ - Shahid M, Beyan C, Murino V. [Comparisons of visual activity primitives for voice activity detection](https://www.researchgate.net/profile/Cigdem-Beyan/publication/335604556_Comparisons_of_Visual_Activity_Primitives_for_Voice_Activity_Detection/links/5fa19074a6fdccfd7b97c0f5/Comparisons-of-Visual-Activity-Primitives-for-Voice-Activity-Detection.pdf), ICIAP, 2019
34
+ - Shahid M, Beyan C, Murino V. [S-VVAD: Visual Voice Activity Detection by Motion](https://www.researchgate.net/profile/Cigdem-Beyan/publication/348279893_S-VVAD_Visual_Voice_Activity_Detection_by_Motion_Segmentation/links/5ff60482299bf14088786cc1/S-VVAD-Visual-Voice-Activity-Detection-by-Motion-Segmentation.pdf), WACV, 2021
35
+ - Beyan C, Shahid M, Murino V. [RealVAD: A real-world dataset and a method for voice activity detection by body motion analysis](https://ieeexplore.ieee.org/document/9133504), IEEE Transactions on Multimedia, 2020.
36
+
37
+ ### Other Paper for Active Speaker Detection
38
+ - Kim You Jin and Heo Hee-Soo, Soyeon Choe, et al. [Look Who’s Talking: Active Speaker Detection in the Wild](https://arxiv.org/pdf/2108.07640.pdf), Interspeech, 2021
talknet-asd/cog.yaml ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Configuration for Cog ⚙️
2
+ # Reference: https://github.com/replicate/cog/blob/main/docs/yaml.md
3
+
4
+ build:
5
+ # set to true if your model requires a GPU
6
+ gpu: true
7
+
8
+ # a list of ubuntu apt packages to install
9
+ system_packages:
10
+ - "libgl1-mesa-glx"
11
+ - "ffmpeg"
12
+ # - "libglib2.0-0"
13
+
14
+ # python version in the form '3.11' or '3.11.4'
15
+ python_version: "3.8"
16
+
17
+ # a list of packages in the format <package-name>==<version>
18
+ python_packages:
19
+ - "torch>=1.6.0"
20
+ - "torchaudio>=0.6.0"
21
+ - "numpy"
22
+ - "scipy"
23
+ - "scikit-learn"
24
+ - "tqdm"
25
+ - "scenedetect"
26
+ - "opencv-python"
27
+ - "python_speech_features"
28
+ - "torchvision"
29
+ - "ffmpeg"
30
+ - "gdown"
31
+ - "youtube-dl"
32
+ - "pandas"
33
+
34
+ # commands run after the environment is setup
35
+ # run:
36
+ # - "echo env is ready!"
37
+ # - "echo another command if needed"
38
+
39
+ # predict.py defines how predictions are run on your model
40
+ predict: "predict.py:Predictor"
talknet-asd/dataLoader.py ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, torch, numpy, cv2, random, glob, python_speech_features
2
+ from scipy.io import wavfile
3
+ from torchvision.transforms import RandomCrop
4
+
5
def generate_audio_set(dataPath, batchList):
    """Read the wav file of every batch entry into a dict keyed by entry name.

    Each entry is a tab-separated list line whose first field is the entry
    name; the first 11 characters of that name identify the parent video
    folder under dataPath.
    """
    waveforms = {}
    for entry in batchList:
        entryName = entry.split('\t')[0]
        wavPath = os.path.join(dataPath, entryName[:11], entryName + '.wav')
        _, samples = wavfile.read(wavPath)
        waveforms[entryName] = samples
    return waveforms
14
+
15
def overlap(dataName, audio, audioSet):
    """Augment `audio` by mixing in one other utterance from `audioSet`
    at a random SNR drawn uniformly from [-5, 5] dB.

    `audio` is an int16 waveform; the return value is int16 of the same
    length.  The noise utterance is wrap-padded or truncated to match.
    """
    # sorted() gives a deterministic candidate order; the original
    # random.sample() on a set raises TypeError on Python >= 3.11 and was
    # hash-order dependent before that.
    noiseName = random.choice(sorted(set(audioSet.keys()) - {dataName}))
    noiseAudio = audioSet[noiseName]
    snr = random.uniform(-5, 5)
    if len(noiseAudio) < len(audio):
        shortage = len(audio) - len(noiseAudio)
        noiseAudio = numpy.pad(noiseAudio, (0, shortage), 'wrap')
    else:
        noiseAudio = noiseAudio[:len(audio)]
    # Cast to float64 before squaring: the int16 input would otherwise wrap
    # around on x**2 and corrupt the dB estimate.
    noiseDB = 10 * numpy.log10(numpy.mean(noiseAudio.astype(numpy.float64) ** 2) + 1e-4)
    cleanDB = 10 * numpy.log10(numpy.mean(audio.astype(numpy.float64) ** 2) + 1e-4)
    # Scale the noise so that cleanDB - noiseDB equals the target SNR.
    noiseAudio = numpy.sqrt(10 ** ((cleanDB - noiseDB - snr) / 10)) * noiseAudio
    audio = audio + noiseAudio
    return audio.astype(numpy.int16)
29
+
30
def load_audio(data, dataPath, numFrames, audioAug, audioSet = None):
    """Return the MFCC feature matrix for one entry, truncated/padded to
    exactly numFrames * 4 audio frames (13 cepstral coefficients each).

    With audioAug enabled, roughly half of the calls mix in another
    utterance from audioSet via overlap().
    """
    dataName = data[0]
    fps = float(data[2])
    audio = audioSet[dataName]
    # Randomly apply noise augmentation on ~50% of the training samples.
    if audioAug and random.randint(0, 1) == 1:
        audio = overlap(dataName, audio, audioSet)
    # fps is not always 25; scale the MFCC window and step so the audio
    # frames stay aligned with the visual frames (4 audio frames per video
    # frame at 25 fps).
    winLen = 0.025 * 25 / fps
    winStep = 0.010 * 25 / fps
    audio = python_speech_features.mfcc(audio, 16000, numcep = 13, winlen = winLen, winstep = winStep)
    targetLen = int(numFrames * 4)
    if audio.shape[0] < targetLen:
        # Wrap-pad short clips up to the target length.
        audio = numpy.pad(audio, ((0, targetLen - audio.shape[0]), (0, 0)), 'wrap')
    return audio[:int(round(numFrames * 4)), :]
48
+
49
def load_visual(data, dataPath, numFrames, visualAug):
    """Load up to numFrames 112x112 grayscale face crops for one entry.

    With visualAug, one augmentation (none / horizontal flip / random crop /
    random rotation) is chosen per clip and applied to every frame.
    Returns a numpy array of shape (frames, 112, 112).
    """
    dataName = data[0]
    videoName = data[0][:11]
    faceFolderPath = os.path.join(dataPath, videoName, dataName)
    faceFiles = glob.glob("%s/*.jpg"%faceFolderPath)
    # Frame files are named numerically; sort by that number, ascending.
    sortedFaceFiles = sorted(faceFiles, key=lambda data: (float(data.split('/')[-1][:-4])), reverse=False)
    faces = []
    H = 112
    if visualAug == True:
        # Crop size in [0.7*H, H); clamp to H-1 so numpy.random.randint(0,
        # H - new) always has a non-empty range — random.uniform(0.7, 1) can
        # return exactly 1.0, and new == H made randint raise ValueError.
        new = min(int(H * random.uniform(0.7, 1)), H - 1)
        x, y = numpy.random.randint(0, H - new), numpy.random.randint(0, H - new)
        M = cv2.getRotationMatrix2D((H/2, H/2), random.uniform(-15, 15), 1)
        augType = random.choice(['orig', 'flip', 'crop', 'rotate'])
    else:
        augType = 'orig'
    for faceFile in sortedFaceFiles[:numFrames]:
        face = cv2.imread(faceFile)
        face = cv2.cvtColor(face, cv2.COLOR_BGR2GRAY)
        face = cv2.resize(face, (H, H))
        if augType == 'orig':
            faces.append(face)
        elif augType == 'flip':
            faces.append(cv2.flip(face, 1))
        elif augType == 'crop':
            faces.append(cv2.resize(face[y:y+new, x:x+new], (H, H)))
        elif augType == 'rotate':
            faces.append(cv2.warpAffine(face, M, (H, H)))
    return numpy.array(faces)
78
+
79
+
80
def load_label(data, numFrames):
    """Parse the per-frame label field (e.g. "[1,0,1]") of a list entry into
    a numpy int array truncated to at most numFrames entries."""
    raw = data[3].replace('[', '').replace(']', '')
    parsed = [int(token) for token in raw.split(',')]
    return numpy.array(parsed[:numFrames])
88
+
89
class train_loader(object):
    """Dataset-like loader that yields whole training mini-batches per index.

    Entries of (roughly) equal frame length are packed into the same batch so
    the whole batch can be stacked into dense tensors without per-sample
    padding; every entry in a batch is truncated to the batch's shortest clip.
    """

    def __init__(self, trialFileName, audioPath, visualPath, batchSize, **kwargs):
        # trialFileName: list file with tab-separated fields; field 1 is the
        # clip length in frames (see load_audio/load_visual/load_label).
        self.audioPath = audioPath
        self.visualPath = visualPath
        self.miniBatch = []
        # NOTE(review): file handle is never closed explicitly (relies on GC).
        mixLst = open(trialFileName).read().splitlines()
        # sort the training set by the length of the videos, shuffle them to make more videos in the same batch belong to different movies
        sortedMixLst = sorted(mixLst, key=lambda data: (int(data.split('\t')[1]), int(data.split('\t')[-1])), reverse=True)
        # Pack entries into batches: longer clips get fewer entries so the
        # total frame count per batch stays near batchSize.
        start = 0
        while True:
            length = int(sortedMixLst[start].split('\t')[1])
            end = min(len(sortedMixLst), start + max(int(batchSize / length), 1))
            self.miniBatch.append(sortedMixLst[start:end])
            if end == len(sortedMixLst):
                break
            start = end
        # NOTE(review): an empty trial file raises IndexError on the first
        # sortedMixLst[start] — confirm callers always pass a non-empty list.

    def __getitem__(self, index):
        batchList = self.miniBatch[index]
        # The list is sorted descending by length, so the last entry is the
        # shortest; everything is truncated to its frame count.
        numFrames = int(batchList[-1].split('\t')[1])
        audioFeatures, visualFeatures, labels = [], [], []
        audioSet = generate_audio_set(self.audioPath, batchList) # load the audios in this batch to do augmentation
        for line in batchList:
            data = line.split('\t')
            audioFeatures.append(load_audio(data, self.audioPath, numFrames, audioAug = True, audioSet = audioSet))
            visualFeatures.append(load_visual(data, self.visualPath,numFrames, visualAug = True))
            labels.append(load_label(data, numFrames))
        # Shapes: audio (B, numFrames*4, 13); visual (B, numFrames, 112, 112);
        # labels (B, numFrames).
        return torch.FloatTensor(numpy.array(audioFeatures)), \
               torch.FloatTensor(numpy.array(visualFeatures)), \
               torch.LongTensor(numpy.array(labels))

    def __len__(self):
        return len(self.miniBatch)
122
+
123
+
124
class val_loader(object):
    """Dataset-like loader for validation: one entry per item, no
    augmentation, batch dimension of 1.

    Returns the same (audio, visual, label) tensor triple as train_loader.
    """

    def __init__(self, trialFileName, audioPath, visualPath, **kwargs):
        self.audioPath = audioPath
        self.visualPath = visualPath
        # Context manager closes the file deterministically (the original
        # left the handle open until garbage collection).
        with open(trialFileName) as trialFile:
            self.miniBatch = trialFile.read().splitlines()

    def __getitem__(self, index):
        line = [self.miniBatch[index]]
        numFrames = int(line[0].split('\t')[1])
        audioSet = generate_audio_set(self.audioPath, line)
        data = line[0].split('\t')
        audioFeatures = [load_audio(data, self.audioPath, numFrames, audioAug = False, audioSet = audioSet)]
        visualFeatures = [load_visual(data, self.visualPath, numFrames, visualAug = False)]
        labels = [load_label(data, numFrames)]
        return torch.FloatTensor(numpy.array(audioFeatures)), \
               torch.FloatTensor(numpy.array(visualFeatures)), \
               torch.LongTensor(numpy.array(labels))

    def __len__(self):
        return len(self.miniBatch)
talknet-asd/demoTalkNet.py ADDED
@@ -0,0 +1,686 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys, time, os, tqdm, torch, argparse, glob, subprocess, warnings, cv2, pickle, numpy, pdb, math, python_speech_features
2
+
3
+ from scipy import signal
4
+ from shutil import rmtree
5
+ from scipy.io import wavfile
6
+ from scipy.interpolate import interp1d
7
+ from sklearn.metrics import accuracy_score, f1_score
8
+
9
+ from scenedetect.video_manager import VideoManager
10
+ from scenedetect.scene_manager import SceneManager
11
+ from scenedetect.frame_timecode import FrameTimecode
12
+ from scenedetect.stats_manager import StatsManager
13
+ from scenedetect.detectors import ContentDetector
14
+
15
+ from model.faceDetector.s3fd import S3FD
16
+ from talkNet import talkNet
17
+
18
warnings.filterwarnings("ignore")

# ---- Command-line interface -------------------------------------------------
parser = argparse.ArgumentParser(description="TalkNet Demo or Columnbia ASD Evaluation")

parser.add_argument("--videoName", type=str, default="001", help="Demo video name")
parser.add_argument(
    "--videoFolder", type=str, default="demo", help="Path for inputs, tmps and outputs"
)
parser.add_argument(
    "--pretrainModel",
    type=str,
    default="pretrain_TalkSet.model",
    help="Path for the pretrained TalkNet model",
)

parser.add_argument(
    "--nDataLoaderThread", type=int, default=10, help="Number of workers"
)
parser.add_argument(
    "--facedetScale",
    type=float,
    default=0.25,
    help="Scale factor for face detection, the frames will be scale to 0.25 orig",
)
parser.add_argument(
    "--minTrack", type=int, default=10, help="Number of min frames for each shot"
)
parser.add_argument(
    "--numFailedDet",
    type=int,
    default=10,
    help="Number of missed detections allowed before tracking is stopped",
)
parser.add_argument(
    "--minFaceSize", type=int, default=1, help="Minimum face size in pixels"
)
parser.add_argument("--cropScale", type=float, default=0.40, help="Scale bounding box")

parser.add_argument("--start", type=int, default=0, help="The start time of the video")
parser.add_argument(
    "--duration",
    type=int,
    default=0,
    help="The duration of the video, when set as 0, will extract the whole video",
)

parser.add_argument(
    "--evalCol",
    dest="evalCol",
    action="store_true",
    help="Evaluate on Columnbia dataset",
)
parser.add_argument(
    "--colSavePath",
    type=str,
    default="/data08/col",
    help="Path for inputs, tmps and outputs",
)

args = parser.parse_args()

# Fetch the pretrained TalkNet weights from Google Drive on first run.
if os.path.isfile(args.pretrainModel) == False:  # Download the pretrained model
    Link = "1AbN9fCf9IexMxEKXLQY2KYBlb-IhSEea"
    cmd = "gdown --id %s -O %s" % (Link, args.pretrainModel)
    subprocess.call(cmd, shell=True, stdout=None)

if args.evalCol == True:
    # Columbia-evaluation mode.  The process is:
    #   1. download video and labels (label format modified to make it easier to use)
    #   2. extract audio, extract video frames
    #   3. scene detection, face detection and face tracking
    #   4. active speaker detection for the detected face clips
    #   5. use IOU to find the identity of each face clip, compute the F1 results
    # Steps 1-3 are a one-time process; speed depends on CPU/GPU (reference: ~1.5 h).
    # Steps 4-5 need less than 10 minutes.  Needs about 20 GB of space in total.
    args.videoName = "col"
    args.videoFolder = args.colSavePath
    args.savePath = os.path.join(args.videoFolder, args.videoName)
    args.videoPath = os.path.join(args.videoFolder, args.videoName + ".mp4")
    args.duration = 0
    if os.path.isfile(args.videoPath) == False:  # Download video
        link = "https://www.youtube.com/watch?v=6GzxbrO0DHM&t=2s"
        cmd = "youtube-dl -f best -o %s '%s'" % (args.videoPath, link)
        output = subprocess.call(cmd, shell=True, stdout=None)
    if os.path.isdir(args.videoFolder + "/col_labels") == False:  # Download label
        link = "1Tto5JBt6NsEOLFRWzyZEeV6kCCddc6wv"
        cmd = "gdown --id %s -O %s" % (link, args.videoFolder + "/col_labels.tar.gz")
        subprocess.call(cmd, shell=True, stdout=None)
        cmd = "tar -xzvf %s -C %s" % (
            args.videoFolder + "/col_labels.tar.gz",
            args.videoFolder,
        )
        subprocess.call(cmd, shell=True, stdout=None)
        os.remove(args.videoFolder + "/col_labels.tar.gz")
else:
    # Demo mode: locate the input video by name inside --videoFolder.
    args.videoPath = glob.glob(os.path.join(args.videoFolder, args.videoName + ".*"))[0]
    args.savePath = os.path.join(args.videoFolder, args.videoName)
117
+
118
+ def scene_detect(args):
119
+ # CPU: Scene detection, output is the list of each shot's time duration
120
+ videoManager = VideoManager([args.videoFilePath])
121
+ statsManager = StatsManager()
122
+ sceneManager = SceneManager(statsManager)
123
+ sceneManager.add_detector(ContentDetector())
124
+ baseTimecode = videoManager.get_base_timecode()
125
+ videoManager.set_downscale_factor()
126
+ videoManager.start()
127
+ sceneManager.detect_scenes(frame_source=videoManager)
128
+ sceneList = sceneManager.get_scene_list(baseTimecode)
129
+ savePath = os.path.join(args.pyworkPath, "scene.pckl")
130
+ if sceneList == []:
131
+ sceneList = [
132
+ (videoManager.get_base_timecode(), videoManager.get_current_timecode())
133
+ ]
134
+ with open(savePath, "wb") as fil:
135
+ pickle.dump(sceneList, fil)
136
+ sys.stderr.write(
137
+ "%s - scenes detected %d\n" % (args.videoFilePath, len(sceneList))
138
+ )
139
+ return sceneList
140
+
141
+
142
+ def inference_video(args):
143
+ # GPU: Face detection, output is the list contains the face location and score in this frame
144
+ DET = S3FD(device="cuda")
145
+ flist = glob.glob(os.path.join(args.pyframesPath, "*.jpg"))
146
+ flist.sort()
147
+ dets = []
148
+ for fidx, fname in enumerate(flist):
149
+ image = cv2.imread(fname)
150
+ imageNumpy = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
151
+ bboxes = DET.detect_faces(imageNumpy, conf_th=0.9, scales=[args.facedetScale])
152
+ dets.append([])
153
+ for bbox in bboxes:
154
+ dets[-1].append(
155
+ {"frame": fidx, "bbox": (bbox[:-1]).tolist(), "conf": bbox[-1]}
156
+ ) # dets has the frames info, bbox info, conf info
157
+ sys.stderr.write(
158
+ "%s-%05d; %d dets\r" % (args.videoFilePath, fidx, len(dets[-1]))
159
+ )
160
+ savePath = os.path.join(args.pyworkPath, "faces.pckl")
161
+ with open(savePath, "wb") as fil:
162
+ pickle.dump(dets, fil)
163
+ return dets
164
+
165
+
166
+ def bb_intersection_over_union(boxA, boxB, evalCol=False):
167
+ # CPU: IOU Function to calculate overlap between two image
168
+ xA = max(boxA[0], boxB[0])
169
+ yA = max(boxA[1], boxB[1])
170
+ xB = min(boxA[2], boxB[2])
171
+ yB = min(boxA[3], boxB[3])
172
+ interArea = max(0, xB - xA) * max(0, yB - yA)
173
+ boxAArea = (boxA[2] - boxA[0]) * (boxA[3] - boxA[1])
174
+ boxBArea = (boxB[2] - boxB[0]) * (boxB[3] - boxB[1])
175
+ if evalCol == True:
176
+ iou = interArea / float(boxAArea)
177
+ else:
178
+ iou = interArea / float(boxAArea + boxBArea - interArea)
179
+ return iou
180
+
181
+
182
+ def track_shot(args, sceneFaces):
183
+ # CPU: Face tracking
184
+ iouThres = 0.5 # Minimum IOU between consecutive face detections
185
+ tracks = []
186
+ while True:
187
+ track = []
188
+ for frameFaces in sceneFaces:
189
+ for face in frameFaces:
190
+ if track == []:
191
+ track.append(face)
192
+ frameFaces.remove(face)
193
+ elif face["frame"] - track[-1]["frame"] <= args.numFailedDet:
194
+ iou = bb_intersection_over_union(face["bbox"], track[-1]["bbox"])
195
+ if iou > iouThres:
196
+ track.append(face)
197
+ frameFaces.remove(face)
198
+ continue
199
+ else:
200
+ break
201
+ if track == []:
202
+ break
203
+ elif len(track) > args.minTrack:
204
+ frameNum = numpy.array([f["frame"] for f in track])
205
+ bboxes = numpy.array([numpy.array(f["bbox"]) for f in track])
206
+ frameI = numpy.arange(frameNum[0], frameNum[-1] + 1)
207
+ bboxesI = []
208
+ for ij in range(0, 4):
209
+ interpfn = interp1d(frameNum, bboxes[:, ij])
210
+ bboxesI.append(interpfn(frameI))
211
+ bboxesI = numpy.stack(bboxesI, axis=1)
212
+ if (
213
+ max(
214
+ numpy.mean(bboxesI[:, 2] - bboxesI[:, 0]),
215
+ numpy.mean(bboxesI[:, 3] - bboxesI[:, 1]),
216
+ )
217
+ > args.minFaceSize
218
+ ):
219
+ tracks.append({"frame": frameI, "bbox": bboxesI})
220
+ return tracks
221
+
222
+
223
def crop_video(args, track, cropFile):
    """CPU: crop one face track into a 224x224 clip with aligned audio.

    Writes ``<cropFile>t.avi`` (silent face crops at 25 fps), extracts the
    matching audio span to ``<cropFile>.wav`` via ffmpeg, muxes both into
    ``<cropFile>.avi`` and removes the temporary silent video.

    Returns:
        dict: the original ``track`` plus ``proc_track`` — the median-smoothed
        per-frame crop center (x, y) and half box size s.
    """
    flist = glob.glob(os.path.join(args.pyframesPath, "*.jpg"))  # Read the frames
    flist.sort()
    vOut = cv2.VideoWriter(
        cropFile + "t.avi", cv2.VideoWriter_fourcc(*"XVID"), 25, (224, 224)
    )  # Write video
    dets = {"x": [], "y": [], "s": []}
    for det in track["bbox"]:  # Read the tracks
        dets["s"].append(max((det[3] - det[1]), (det[2] - det[0])) / 2)  # half box size
        dets["y"].append((det[1] + det[3]) / 2)  # crop center y
        dets["x"].append((det[0] + det[2]) / 2)  # crop center x
    dets["s"] = signal.medfilt(dets["s"], kernel_size=13)  # Smooth detections
    dets["x"] = signal.medfilt(dets["x"], kernel_size=13)
    dets["y"] = signal.medfilt(dets["y"], kernel_size=13)
    for fidx, frame in enumerate(track["frame"]):
        cs = args.cropScale
        bs = dets["s"][fidx]  # Detection box size
        bsi = int(bs * (1 + 2 * cs))  # Pad videos by this amount
        image = cv2.imread(flist[frame])
        # Pad with gray (110) so crops near the image border stay in bounds.
        frame = numpy.pad(
            image,
            ((bsi, bsi), (bsi, bsi), (0, 0)),
            "constant",
            constant_values=(110, 110),
        )
        my = dets["y"][fidx] + bsi  # BBox center Y (in padded coordinates)
        mx = dets["x"][fidx] + bsi  # BBox center X (in padded coordinates)
        face = frame[
            int(my - bs) : int(my + bs * (1 + 2 * cs)),
            int(mx - bs * (1 + cs)) : int(mx + bs * (1 + cs)),
        ]
        vOut.write(cv2.resize(face, (224, 224)))
    audioTmp = cropFile + ".wav"
    # Timestamps assume the fixed 25 fps frame-extraction rate.
    audioStart = (track["frame"][0]) / 25
    audioEnd = (track["frame"][-1] + 1) / 25
    vOut.release()
    command = (
        "ffmpeg -y -i %s -async 1 -ac 1 -vn -acodec pcm_s16le -ar 16000 -threads %d -ss %.3f -to %.3f %s -loglevel panic"
        % (args.audioFilePath, args.nDataLoaderThread, audioStart, audioEnd, audioTmp)
    )
    output = subprocess.call(command, shell=True, stdout=None)  # Crop audio file
    _, audio = wavfile.read(audioTmp)
    command = (
        "ffmpeg -y -i %st.avi -i %s -threads %d -c:v copy -c:a copy %s.avi -loglevel panic"
        % (cropFile, audioTmp, args.nDataLoaderThread, cropFile)
    )  # Combine audio and video file
    output = subprocess.call(command, shell=True, stdout=None)
    os.remove(cropFile + "t.avi")
    return {"track": track, "proc_track": dets}
273
+
274
+
275
def extract_MFCC(file, outPath):
    """CPU: extract 13-dim MFCC features from a wav file and save as .npy.

    The output keeps the input's base name with a ``.npy`` suffix and is
    written into ``outPath``.  [1s of 16 kHz audio = 100 MFCC frames]
    """
    sr, audio = wavfile.read(file)
    mfcc = python_speech_features.mfcc(audio, sr)  # (N_frames, 13)
    # os.path.basename instead of split("/") so Windows paths also work.
    baseName = os.path.basename(file).replace(".wav", ".npy")
    featuresPath = os.path.join(outPath, baseName)
    numpy.save(featuresPath, mfcc)
281
+
282
+
283
def evaluate_network(files, args):
    """GPU: active speaker detection on each cropped clip with pretrained TalkNet.

    For every ``*.avi`` in ``files``, the matching ``.wav`` is loaded, MFCC
    features (100 fps) and grayscale 112x112 center crops (25 fps) are built,
    and TalkNet is scored over several window durations; the per-duration
    score tracks are averaged for robustness.

    Returns:
        list[numpy.ndarray]: one array of frame-level speaking scores per clip.
    """
    s = talkNet()
    s.loadParameters(args.pretrainModel)
    sys.stderr.write("Model %s loaded from previous state! \r\n" % args.pretrainModel)
    s.eval()
    allScores = []
    # durationSet = {1,2,4,6} # To make the result more reliable
    # NOTE(review): duplicates collapse in a set literal, so this equals
    # {1, 2, 3, 4, 5, 6}; the repeated values add no extra weighting.
    durationSet = {
        1,
        1,
        1,
        2,
        2,
        2,
        3,
        3,
        4,
        5,
        6,
    }  # Use this line can get more reliable result
    for file in tqdm.tqdm(files, total=len(files)):
        fileName = os.path.splitext(file.split("/")[-1])[0]  # Load audio and video
        _, audio = wavfile.read(os.path.join(args.pycropPath, fileName + ".wav"))
        audioFeature = python_speech_features.mfcc(
            audio, 16000, numcep=13, winlen=0.025, winstep=0.010
        )
        video = cv2.VideoCapture(os.path.join(args.pycropPath, fileName + ".avi"))
        videoFeature = []
        while video.isOpened():
            ret, frames = video.read()
            if ret == True:
                face = cv2.cvtColor(frames, cv2.COLOR_BGR2GRAY)
                face = cv2.resize(face, (224, 224))
                # Keep the central 112x112 patch of the 224x224 crop.
                face = face[
                    int(112 - (112 / 2)) : int(112 + (112 / 2)),
                    int(112 - (112 / 2)) : int(112 + (112 / 2)),
                ]
                videoFeature.append(face)
            else:
                break
        video.release()
        videoFeature = numpy.array(videoFeature)
        # Clip both streams to a common duration (audio 100 fps, video 25 fps).
        length = min(
            (audioFeature.shape[0] - audioFeature.shape[0] % 4) / 100,
            videoFeature.shape[0] / 25,
        )
        audioFeature = audioFeature[: int(round(length * 100)), :]
        videoFeature = videoFeature[: int(round(length * 25)), :, :]
        allScore = []  # Evaluation use TalkNet
        for duration in durationSet:
            batchSize = int(math.ceil(length / duration))
            scores = []
            with torch.no_grad():
                for i in range(batchSize):
                    inputA = (
                        torch.FloatTensor(
                            audioFeature[
                                i * duration * 100 : (i + 1) * duration * 100, :
                            ]
                        )
                        .unsqueeze(0)
                        .cuda()
                    )
                    inputV = (
                        torch.FloatTensor(
                            videoFeature[
                                i * duration * 25 : (i + 1) * duration * 25, :, :
                            ]
                        )
                        .unsqueeze(0)
                        .cuda()
                    )
                    embedA = s.model.forward_audio_frontend(inputA)
                    embedV = s.model.forward_visual_frontend(inputV)
                    embedA, embedV = s.model.forward_cross_attention(embedA, embedV)
                    out = s.model.forward_audio_visual_backend(embedA, embedV)
                    score = s.lossAV.forward(out, labels=None)
                    scores.extend(score)
            allScore.append(scores)
        # Average the per-duration score tracks into one estimate per frame.
        allScore = numpy.round((numpy.mean(numpy.array(allScore), axis=0)), 1).astype(
            float
        )
        allScores.append(allScore)
    return allScores
368
+
369
+
370
def visualization(tracks, scores, args):
    """CPU: render the ASD result onto the frames and mux with audio.

    Draws a box per tracked face — green when the smoothed score is >= 0
    (speaking), red otherwise — writes ``video_only.avi`` and then combines
    it with the extracted audio into ``video_out.avi`` via ffmpeg.
    """
    flist = glob.glob(os.path.join(args.pyframesPath, "*.jpg"))
    flist.sort()
    faces = [[] for i in range(len(flist))]
    for tidx, track in enumerate(tracks):
        score = scores[tidx]
        for fidx, frame in enumerate(track["track"]["frame"].tolist()):
            # NOTE(review): the slice end is len(score) - 1, so the very last
            # score never enters the smoothing window — kept as upstream.
            s = score[
                max(fidx - 2, 0) : min(fidx + 3, len(score) - 1)
            ]  # average smoothing
            s = numpy.mean(s)
            faces[frame].append(
                {
                    "track": tidx,
                    "score": float(s),
                    "s": track["proc_track"]["s"][fidx],
                    "x": track["proc_track"]["x"][fidx],
                    "y": track["proc_track"]["y"][fidx],
                }
            )
    firstImage = cv2.imread(flist[0])
    fw = firstImage.shape[1]
    fh = firstImage.shape[0]
    vOut = cv2.VideoWriter(
        os.path.join(args.pyaviPath, "video_only.avi"),
        cv2.VideoWriter_fourcc(*"XVID"),
        25,
        (fw, fh),
    )
    # Negative score -> 0 (red in BGR), non-negative -> 255 (green in BGR).
    colorDict = {0: 0, 1: 255}
    for fidx, fname in tqdm.tqdm(enumerate(flist), total=len(flist)):
        image = cv2.imread(fname)
        for face in faces[fidx]:
            clr = colorDict[int((face["score"] >= 0))]
            txt = round(face["score"], 1)
            cv2.rectangle(
                image,
                (int(face["x"] - face["s"]), int(face["y"] - face["s"])),
                (int(face["x"] + face["s"]), int(face["y"] + face["s"])),
                (0, clr, 255 - clr),
                10,
            )
            cv2.putText(
                image,
                "%s" % (txt),
                (int(face["x"] - face["s"]), int(face["y"] - face["s"])),
                cv2.FONT_HERSHEY_SIMPLEX,
                1.5,
                (0, clr, 255 - clr),
                5,
            )
        vOut.write(image)
    vOut.release()
    command = (
        "ffmpeg -y -i %s -i %s -threads %d -c:v copy -c:a copy %s -loglevel panic"
        % (
            os.path.join(args.pyaviPath, "video_only.avi"),
            os.path.join(args.pyaviPath, "audio.wav"),
            args.nDataLoaderThread,
            os.path.join(args.pyaviPath, "video_out.avi"),
        )
    )
    output = subprocess.call(command, shell=True, stdout=None)
434
+
435
+
436
def evaluate_col_ASD(tracks, scores, args):
    """Evaluate ASD predictions against the Columbia dataset labels.

    Loads per-speaker ground-truth boxes and speak/not-speak labels from
    ``<videoFolder>/col_labels/fusion/*.txt`` (29.97 fps frame indices are
    rescaled to 25 fps), matches each labelled face to the best-overlapping
    predicted face, then prints per-speaker ACC/F1 and the average F1 over
    the five scored speakers ("abbas" is excluded from the average).
    """
    txtPath = args.videoFolder + "/col_labels/fusion/*.txt"  # Load labels
    predictionSet = {}
    for name in {"long", "bell", "boll", "lieb", "sick", "abbas"}:
        predictionSet[name] = [[], []]
    dictGT = {}
    txtFiles = glob.glob("%s" % txtPath)
    for file in txtFiles:
        lines = open(file).read().splitlines()
        idName = file.split("/")[-1][:-4]
        for line in lines:
            data = line.split("\t")
            # Label frame index (29.97 fps) -> our 25 fps frame index.
            frame = int(int(data[0]) / 29.97 * 25)
            x1 = int(data[1])
            y1 = int(data[2])
            x2 = int(data[1]) + int(data[3])
            y2 = int(data[2]) + int(data[3])
            gt = int(data[4])
            if frame in dictGT:
                dictGT[frame].append([x1, y1, x2, y2, gt, idName])
            else:
                dictGT[frame] = [[x1, y1, x2, y2, gt, idName]]
    flist = glob.glob(os.path.join(args.pyframesPath, "*.jpg"))  # Load files
    flist.sort()
    faces = [[] for i in range(len(flist))]
    for tidx, track in enumerate(tracks):
        score = scores[tidx]
        for fidx, frame in enumerate(track["track"]["frame"].tolist()):
            s = numpy.mean(
                score[max(fidx - 2, 0) : min(fidx + 3, len(score) - 1)]
            )  # average smoothing
            faces[frame].append(
                {
                    "track": tidx,
                    "score": float(s),
                    "s": track["proc_track"]["s"][fidx],
                    "x": track["proc_track"]["x"][fidx],
                    "y": track["proc_track"]["y"][fidx],
                }
            )
    for fidx, fname in tqdm.tqdm(enumerate(flist), total=len(flist)):
        if fidx in dictGT:  # This frame has label
            for gtThisFrame in dictGT[fidx]:  # What this label is ?
                faceGT = gtThisFrame[0:4]
                labelGT = gtThisFrame[4]
                idGT = gtThisFrame[5]
                ious = []
                for face in faces[fidx]:  # Find the right face in my result
                    faceLocation = [  # full-resolution box (unused, kept from upstream)
                        int(face["x"] - face["s"]),
                        int(face["y"] - face["s"]),
                        int(face["x"] + face["s"]),
                        int(face["y"] + face["s"]),
                    ]
                    # Coordinates halved — presumably the Columbia labels were
                    # annotated on a half-resolution video; confirm upstream.
                    faceLocation_new = [
                        int(face["x"] - face["s"]) // 2,
                        int(face["y"] - face["s"]) // 2,
                        int(face["x"] + face["s"]) // 2,
                        int(face["y"] + face["s"]) // 2,
                    ]
                    iou = bb_intersection_over_union(
                        faceLocation_new, faceGT, evalCol=True
                    )
                    if iou > 0.5:
                        ious.append([iou, round(face["score"], 2)])
                if len(ious) > 0:  # Find my result
                    ious.sort()
                    labelPredict = ious[-1][1]  # score of the best-overlap face
                else:
                    labelPredict = 0
                x1 = faceGT[0]  # (unused, kept from upstream)
                y1 = faceGT[1]
                width = faceGT[2] - faceGT[0]
                predictionSet[idGT][0].append(labelPredict)
                predictionSet[idGT][1].append(labelGT)
    names = ["long", "bell", "boll", "lieb", "sick", "abbas"]  # Evaluate
    names.sort()
    F1s = 0
    for i in names:
        scores = numpy.array(predictionSet[i][0])
        labels = numpy.array(predictionSet[i][1])
        # A positive smoothed score counts as a "speaking" prediction.
        scores = numpy.int64(scores > 0)
        F1 = f1_score(labels, scores)
        ACC = accuracy_score(labels, scores)
        if i != "abbas":
            F1s += F1
        print("%s, ACC:%.2f, F1:%.2f" % (i, 100 * ACC, 100 * F1))
    print("Average F1:%.2f" % (100 * (F1s / 5)))
524
+
525
+
526
# Main function
def main():
    """Run the full TalkNet active-speaker-detection demo pipeline.

    Re-encodes the input video at 25 fps, extracts the audio track and all
    frames, detects scenes and faces, tracks and crops every face, scores
    each crop with TalkNet, and finally either evaluates on the Columbia
    labels (``args.evalCol``) or writes an annotated output video.

    Relies on the module-level ``args`` namespace for all paths/settings.
    """
    # This preprocessing is modified based on this [repository](https://github.com/joonson/syncnet_python).
    # ```
    # .
    # ├── pyavi
    # │   ├── audio.wav (Audio from input video)
    # │   ├── video.avi (Copy of the input video)
    # │   ├── video_only.avi (Output video without audio)
    # │   └── video_out.avi (Output video with audio)
    # ├── pycrop (The detected face videos and audios)
    # │ ├── 000000.avi
    # │ ├── 000000.wav
    # │ ├── 000001.avi
    # │ ├── 000001.wav
    # │ └── ...
    # ├── pyframes (All the video frames in this video)
    # │ ├── 000001.jpg
    # │ ├── 000002.jpg
    # │ └── ...
    # └── pywork
    # ├── faces.pckl (face detection result)
    # ├── scene.pckl (scene detection result)
    # ├── scores.pckl (ASD result)
    # └── tracks.pckl (face tracking result)
    # ```

    # Initialization: wipe any previous run, then recreate the working dirs.
    args.pyaviPath = os.path.join(args.savePath, "pyavi")
    args.pyframesPath = os.path.join(args.savePath, "pyframes")
    args.pyworkPath = os.path.join(args.savePath, "pywork")
    args.pycropPath = os.path.join(args.savePath, "pycrop")
    if os.path.exists(args.savePath):
        rmtree(args.savePath)
    os.makedirs(
        args.pyaviPath, exist_ok=True
    )  # The path for the input video, input audio, output video
    os.makedirs(args.pyframesPath, exist_ok=True)  # Save all the video frames
    os.makedirs(
        args.pyworkPath, exist_ok=True
    )  # Save the results in this process by the pckl method
    os.makedirs(
        args.pycropPath, exist_ok=True
    )  # Save the detected face clips (audio+video) in this process

    # Extract video
    args.videoFilePath = os.path.join(args.pyaviPath, "video.avi")
    # If duration was not set, extract the whole video, otherwise extract the video from 'args.start' to 'args.start + args.duration'
    if args.duration == 0:
        command = (
            "ffmpeg -y -i %s -qscale:v 2 -threads %d -async 1 -r 25 %s -loglevel panic"
            % (args.videoPath, args.nDataLoaderThread, args.videoFilePath)
        )
    else:
        command = (
            "ffmpeg -y -i %s -qscale:v 2 -threads %d -ss %.3f -to %.3f -async 1 -r 25 %s -loglevel panic"
            % (
                args.videoPath,
                args.nDataLoaderThread,
                args.start,
                args.start + args.duration,
                args.videoFilePath,
            )
        )
    subprocess.call(command, shell=True, stdout=None)
    sys.stderr.write(
        time.strftime("%Y-%m-%d %H:%M:%S")
        + " Extract the video and save in %s \r\n" % (args.videoFilePath)
    )

    # Extract audio (mono, 16 kHz — matches the MFCC settings downstream)
    args.audioFilePath = os.path.join(args.pyaviPath, "audio.wav")
    command = (
        "ffmpeg -y -i %s -qscale:a 0 -ac 1 -vn -threads %d -ar 16000 %s -loglevel panic"
        % (args.videoFilePath, args.nDataLoaderThread, args.audioFilePath)
    )
    subprocess.call(command, shell=True, stdout=None)
    sys.stderr.write(
        time.strftime("%Y-%m-%d %H:%M:%S")
        + " Extract the audio and save in %s \r\n" % (args.audioFilePath)
    )

    # Extract the video frames
    command = "ffmpeg -y -i %s -qscale:v 2 -threads %d -f image2 %s -loglevel panic" % (
        args.videoFilePath,
        args.nDataLoaderThread,
        os.path.join(args.pyframesPath, "%06d.jpg"),
    )
    subprocess.call(command, shell=True, stdout=None)
    sys.stderr.write(
        time.strftime("%Y-%m-%d %H:%M:%S")
        + " Extract the frames and save in %s \r\n" % (args.pyframesPath)
    )

    # Scene detection for the video frames
    scene = scene_detect(args)
    sys.stderr.write(
        time.strftime("%Y-%m-%d %H:%M:%S")
        + " Scene detection and save in %s \r\n" % (args.pyworkPath)
    )

    # Face detection for the video frames
    faces = inference_video(args)
    sys.stderr.write(
        time.strftime("%Y-%m-%d %H:%M:%S")
        + " Face detection and save in %s \r\n" % (args.pyworkPath)
    )

    # Face tracking
    allTracks, vidTracks = [], []
    for shot in scene:
        if (
            shot[1].frame_num - shot[0].frame_num >= args.minTrack
        ):  # Discard the shot frames less than minTrack frames
            allTracks.extend(
                track_shot(args, faces[shot[0].frame_num : shot[1].frame_num])
            )  # 'frames' to present this tracks' timestep, 'bbox' presents the location of the faces
    sys.stderr.write(
        time.strftime("%Y-%m-%d %H:%M:%S")
        + " Face track and detected %d tracks \r\n" % len(allTracks)
    )

    # Face clips cropping
    for ii, track in tqdm.tqdm(enumerate(allTracks), total=len(allTracks)):
        vidTracks.append(
            crop_video(args, track, os.path.join(args.pycropPath, "%05d" % ii))
        )
    savePath = os.path.join(args.pyworkPath, "tracks.pckl")
    with open(savePath, "wb") as fil:
        pickle.dump(vidTracks, fil)
    sys.stderr.write(
        time.strftime("%Y-%m-%d %H:%M:%S")
        + " Face Crop and saved in %s tracks \r\n" % args.pycropPath
    )
    fil = open(savePath, "rb")
    vidTracks = pickle.load(fil)

    # Active Speaker Detection by TalkNet
    files = glob.glob("%s/*.avi" % args.pycropPath)
    files.sort()
    scores = evaluate_network(files, args)
    savePath = os.path.join(args.pyworkPath, "scores.pckl")
    with open(savePath, "wb") as fil:
        pickle.dump(scores, fil)
    sys.stderr.write(
        time.strftime("%Y-%m-%d %H:%M:%S")
        + " Scores extracted and saved in %s \r\n" % args.pyworkPath
    )

    if args.evalCol == True:
        evaluate_col_ASD(
            vidTracks, scores, args
        )  # The columnbia video is too big for visualization. You can still add the `visualization` funcition here if you want
        quit()
    else:
        # Visualization, save the result as the new video
        visualization(vidTracks, scores, args)


if __name__ == "__main__":
    main()
talknet-asd/export_onnx_cpu.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import torch
4
+
5
+ from loss import lossAV
6
+ from model.talkNetModel import talkNetModel
7
+
8
+
9
class TalkNetCPU(torch.nn.Module):
    """CPU-only wrapper for TalkNet export.

    Bundles the TalkNet backbone with the lossAV classification head so the
    full audio-visual scoring path can be exported as a single ONNX graph.
    """

    def __init__(self, ckpt_path: str):
        super().__init__()
        self.model = talkNetModel()
        self.lossAV = lossAV()
        # The checkpoint is loaded lazily by load_parameters(), not here.
        self.ckpt_path = ckpt_path

    def load_parameters(self) -> None:
        """Load state_dict saved by talkNet.saveParameters (handles module. prefix)."""
        self_state = self.state_dict()
        # NOTE(review): torch.load unpickles arbitrary objects — only load
        # trusted checkpoints (consider weights_only=True on recent torch).
        loaded_state = torch.load(self.ckpt_path, map_location="cpu")

        for name, param in loaded_state.items():
            orig_name = name
            target_name = name
            # Checkpoints saved from DataParallel carry a "module." prefix.
            if target_name not in self_state:
                target_name = target_name.replace("module.", "")
            if target_name not in self_state:
                print(f"{orig_name} is not in the model.")
                continue
            if self_state[target_name].shape != loaded_state[orig_name].shape:
                print(
                    f"Shape mismatch {orig_name}: "
                    f"model {self_state[target_name].shape}, "
                    f"loaded {loaded_state[orig_name].shape}"
                )
                continue
            # In-place copy into the state_dict tensor updates the module's
            # parameters, which share the same storage.
            self_state[target_name].copy_(param)

    def forward(self, audio_mfcc: torch.Tensor, video_gray: torch.Tensor) -> torch.Tensor:
        """
        audio_mfcc: (B, Ta, 13)
        video_gray: (B, Tv, 224, 224)
        returns logits: (B*, 2)

        NOTE(review): the export script feeds 112x112 crops (demoTalkNet
        center-crops 224 -> 112) — confirm which spatial size is intended.
        """
        audio_embed = self.model.forward_audio_frontend(audio_mfcc)
        visual_embed = self.model.forward_visual_frontend(video_gray)
        audio_embed, visual_embed = self.model.forward_cross_attention(
            audio_embed, visual_embed
        )
        av_embed = self.model.forward_audio_visual_backend(audio_embed, visual_embed)
        # Raw 2-class logits from the lossAV head (no softmax in the graph).
        logits = self.lossAV.FC(av_embed)
        return logits
54
+
55
+
56
def main() -> None:
    """Export the CPU TalkNet (backbone + lossAV.FC head) to ONNX.

    Checkpoint and output paths are taken from the CKPT_PATH / OUT_PATH
    environment variables, with repo-relative defaults.
    """
    ckpt_path = os.environ.get("CKPT_PATH", "model/pretrain_TalkSet.model")
    out_path = os.environ.get("OUT_PATH", "talknet_asd_cpu.onnx")

    model = TalkNetCPU(ckpt_path)
    model.load_parameters()
    model.eval()

    # Dummy inputs only to build the graph; real lengths are dynamic via dynamic_axes.
    dummy_audio = torch.randn(1, 100, 13)  # ~1s MFCC (100 frames)
    # Model expects 112x112 (demoTalkNet crops 224->center 112)
    dummy_video = torch.randn(1, 25, 112, 112)  # 25 frames of 112x112 gray crops

    torch.onnx.export(
        model,
        (dummy_audio, dummy_video),
        out_path,
        input_names=["audio_mfcc", "video_gray"],
        output_names=["logits"],
        # Batch and time axes stay symbolic so any clip length can be scored.
        dynamic_axes={
            "audio_mfcc": {0: "batch", 1: "time_audio"},
            "video_gray": {0: "batch", 1: "time_video"},
            "logits": {0: "time_any"},
        },
        opset_version=14,
    )
    print(f"Saved ONNX to {out_path}")


if __name__ == "__main__":
    main()
87
+
talknet-asd/loss.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+
5
class lossAV(nn.Module):
    """Audio-visual head: maps 256-d AV embeddings to speak/not-speak logits.

    At inference (``labels is None``) it returns the raw "speaking" logit per
    frame; at training time it returns the cross-entropy loss plus softmax
    scores, hard predictions, and the number of correct predictions.
    """

    def __init__(self):
        super(lossAV, self).__init__()
        self.criterion = nn.CrossEntropyLoss()
        self.FC = nn.Linear(256, 2)

    def forward(self, x, labels=None):
        """Score (inference) or compute the loss (training).

        Args:
            x: (B, 1, 256) audio-visual embeddings.
            labels: (B,) class indices, or None for inference.

        Returns:
            If labels is None: numpy array (B,) of raw "speaking" logits.
            Otherwise: (loss, softmax scores, predicted labels, #correct).
        """
        x = x.squeeze(1)
        x = self.FC(x)
        # `is None`, not `== None`: == on a tensor is elementwise and is not
        # a reliable way to test for the absence of labels.
        if labels is None:
            predScore = x[:, 1]
            predScore = predScore.t()
            predScore = predScore.view(-1).detach().cpu().numpy()
            return predScore
        else:
            nloss = self.criterion(x, labels)
            predScore = F.softmax(x, dim=-1)
            predLabel = torch.round(F.softmax(x, dim=-1))[:, 1]
            correctNum = (predLabel == labels).sum().float()
            return nloss, predScore, predLabel, correctNum
25
+
26
class lossA(nn.Module):
    """Audio-only head: 128-d embeddings -> 2-way logits, cross-entropy loss."""

    def __init__(self):
        super(lossA, self).__init__()
        self.criterion = nn.CrossEntropyLoss()
        self.FC = nn.Linear(128, 2)

    def forward(self, x, labels):
        """Return the cross-entropy loss for (B, 1, 128) embeddings."""
        logits = self.FC(x.squeeze(1))
        return self.criterion(logits, labels)
37
+
38
class lossV(nn.Module):
    """Visual-only head: 128-d embeddings -> 2-way logits, cross-entropy loss."""

    def __init__(self):
        super(lossV, self).__init__()

        self.criterion = nn.CrossEntropyLoss()
        self.FC = nn.Linear(128, 2)

    def forward(self, x, labels):
        """Return the cross-entropy loss for (B, 1, 128) embeddings."""
        logits = self.FC(x.squeeze(1))
        return self.criterion(logits, labels)
50
+
talknet-asd/model/attentionLayer.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ from torch.nn import functional as F
4
+ from torch.nn import MultiheadAttention
5
+
6
class attentionLayer(nn.Module):
    """Transformer-style cross-attention block.

    ``tar`` attends over ``src``, then a position-wise feed-forward network
    is applied; both sub-layers use a residual connection followed by
    LayerNorm (post-norm).
    """

    def __init__(self, d_model, nhead, dropout=0.1):
        super(attentionLayer, self).__init__()
        self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)

        self.linear1 = nn.Linear(d_model, d_model * 4)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(d_model * 4, d_model)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

        self.activation = F.relu

    def forward(self, src, tar):
        """src, tar: (B, T, C) -> (B, T, C)."""
        # This (sequence-first) MultiheadAttention expects (T, B, C).
        query = tar.transpose(0, 1)
        keyval = src.transpose(0, 1)
        attended = self.self_attn(query, keyval, keyval, attn_mask=None,
                                  key_padding_mask=None)[0]
        out = self.norm1(keyval + self.dropout1(attended))

        ffn = self.linear2(self.dropout(self.activation(self.linear1(out))))
        out = self.norm2(out + self.dropout2(ffn))
        return out.transpose(0, 1)  # back to (B, T, C)
talknet-asd/model/audioEncoder.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+
5
class SEBasicBlock(nn.Module):
    """ResNet basic block with a Squeeze-and-Excitation gate on the main branch."""

    expansion = 1  # output channels = planes * expansion

    def __init__(self, inplanes, planes, stride=1, downsample=None, reduction=8):
        super(SEBasicBlock, self).__init__()
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.se = SELayer(planes, reduction)
        # Projection for the shortcut when channel count / stride changes.
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x

        # NOTE(review): relu is applied before bn1 here (conv-relu-bn), unlike
        # the usual conv-bn-relu order — kept as-is to match trained weights.
        out = self.conv1(x)
        out = self.relu(out)
        out = self.bn1(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.se(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)
        return out
36
+
37
class SELayer(nn.Module):
    """Squeeze-and-Excitation: per-channel gating from globally pooled stats."""

    def __init__(self, channel, reduction=8):
        super(SELayer, self).__init__()
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Sequential(
            nn.Linear(channel, channel // reduction),
            nn.ReLU(inplace=True),
            nn.Linear(channel // reduction, channel),
            nn.Sigmoid()
        )

    def forward(self, x):
        """Scale each channel of (B, C, H, W) by a learned gate in (0, 1)."""
        batch, channels = x.size(0), x.size(1)
        squeezed = self.avg_pool(x).view(batch, channels)
        gates = self.fc(squeezed).view(batch, channels, 1, 1)
        return x * gates
53
+
54
class audioEncoder(nn.Module):
    """SE-ResNet style encoder turning a 2-D MFCC map into a feature sequence.

    Input is (B, 1, H, W); dimension 2 is average-pooled away in forward()
    (assumed to be the frequency axis — TODO confirm against the caller) and
    the output is (B, W', C) with C = num_filters[3].
    """

    def __init__(self, layers, num_filters, **kwargs):
        # `layers` gives the block count per stage, `num_filters` the channel
        # widths; extra **kwargs are accepted but ignored.
        super(audioEncoder, self).__init__()
        block = SEBasicBlock
        self.inplanes = num_filters[0]

        # Stride (2, 1): halve dimension 2, keep dimension 3 resolution.
        self.conv1 = nn.Conv2d(1, num_filters[0], kernel_size=7, stride=(2, 1), padding=3,
                               bias=False)
        self.bn1 = nn.BatchNorm2d(num_filters[0])
        self.relu = nn.ReLU(inplace=True)

        self.layer1 = self._make_layer(block, num_filters[0], layers[0])
        self.layer2 = self._make_layer(block, num_filters[1], layers[1], stride=(2, 2))
        self.layer3 = self._make_layer(block, num_filters[2], layers[2], stride=(2, 2))
        self.layer4 = self._make_layer(block, num_filters[3], layers[3], stride=(1, 1))
        out_dim = num_filters[3] * block.expansion

        # Standard Kaiming / BN initialisation.
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

    def _make_layer(self, block, planes, blocks, stride=1):
        """Stack ``blocks`` SE blocks; project the shortcut when shapes change."""
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes * block.expansion,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(planes * block.expansion),
            )

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample))
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes))

        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        # Collapse dimension 2, flatten, and move channels last:
        # (B, C, H', W') -> (B, W', C).
        x = torch.mean(x, dim=2, keepdim=True)
        x = x.view((x.size()[0], x.size()[1], -1))
        x = x.transpose(1, 2)

        return x
talknet-asd/model/faceDetector/README.md ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # Face detector
2
+
3
+ This face detector is adapted from `https://github.com/cs-giung/face-detection-pytorch`.
talknet-asd/model/faceDetector/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .s3fd import S3FD
talknet-asd/model/faceDetector/s3fd/__init__.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time, os, sys, subprocess
2
+ import numpy as np
3
+ import cv2
4
+ import torch
5
+ from torchvision import transforms
6
+ from .nets import S3FDNet
7
+ from .box_utils import nms_
8
+
9
# Path (relative to the working directory) of the S3FD weights; download
# them from Google Drive on first use.
PATH_WEIGHT = 'model/faceDetector/s3fd/sfd_face.pth'
if os.path.isfile(PATH_WEIGHT) == False:
    Link = "1KafnHz7ccT-3IyddBsL5yi2xGtxAKypt"
    # NOTE(review): `gdown --id` is deprecated in newer gdown releases — verify.
    cmd = "gdown --id %s -O %s"%(Link, PATH_WEIGHT)
    subprocess.call(cmd, shell=True, stdout=None)
# Per-channel pixel means, shaped (3, 1, 1) to broadcast over CHW images.
img_mean = np.array([104., 117., 123.])[:, np.newaxis, np.newaxis].astype('float32')
15
+
16
+
17
class S3FD():
    """Thin inference wrapper around S3FDNet with multi-scale face detection."""

    def __init__(self, device='cuda'):

        tstamp = time.time()
        self.device = device

        # print('[S3FD] loading with', self.device)
        self.net = S3FDNet(device=self.device).to(self.device)
        # PATH_WEIGHT is cwd-relative, so resolve against the working dir.
        PATH = os.path.join(os.getcwd(), PATH_WEIGHT)
        state_dict = torch.load(PATH, map_location=self.device)
        self.net.load_state_dict(state_dict)
        self.net.eval()
        # print('[S3FD] finished loading (%.4f sec)' % (time.time() - tstamp))

    def detect_faces(self, image, conf_th=0.8, scales=[1]):
        """Detect faces in an image.

        Args:
            image: HxWx3 uint8 array.
            conf_th: minimum detection confidence.
            scales: resize factors; detections from all scales are merged
                and de-duplicated with NMS.

        Returns:
            (N, 5) array of [x1, y1, x2, y2, score] rows.
        """

        w, h = image.shape[1], image.shape[0]

        bboxes = np.empty(shape=(0, 5))

        with torch.no_grad():
            for s in scales:
                scaled_img = cv2.resize(image, dsize=(0, 0), fx=s, fy=s, interpolation=cv2.INTER_LINEAR)

                # HWC -> CHW
                scaled_img = np.swapaxes(scaled_img, 1, 2)
                scaled_img = np.swapaxes(scaled_img, 1, 0)
                # NOTE(review): channels are reversed, mean-subtracted, then
                # reversed again — the net effect is the per-channel means
                # applied in flipped order; kept as in the upstream detector.
                scaled_img = scaled_img[[2, 1, 0], :, :]
                scaled_img = scaled_img.astype('float32')
                scaled_img -= img_mean
                scaled_img = scaled_img[[2, 1, 0], :, :]
                x = torch.from_numpy(scaled_img).unsqueeze(0).to(self.device)
                y = self.net(x)

                detections = y.data
                # Scale normalised box coordinates back to input-image pixels.
                scale = torch.Tensor([w, h, w, h])

                for i in range(detections.size(1)):
                    j = 0
                    # Detections per class are sorted by confidence; stop at
                    # the first one below the threshold.
                    while detections[0, i, j, 0] > conf_th:
                        score = detections[0, i, j, 0]
                        pt = (detections[0, i, j, 1:] * scale).cpu().numpy()
                        bbox = (pt[0], pt[1], pt[2], pt[3], score)
                        bboxes = np.vstack((bboxes, bbox))
                        j += 1

        # Merge overlapping detections across scales.
        keep = nms_(bboxes, 0.1)
        bboxes = bboxes[keep]

        return bboxes
talknet-asd/model/faceDetector/s3fd/box_utils.py ADDED
@@ -0,0 +1,217 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ from itertools import product as product
3
+ import torch
4
+ from torch.autograd import Function
5
+
6
+
7
def nms_(dets, thresh):
    """
    Greedy non-maximum suppression over (N, 5) [x1, y1, x2, y2, score] rows.
    Courtesy of Ross Girshick
    [https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/nms/py_cpu_nms.py]
    """
    x1, y1, x2, y2 = dets[:, 0], dets[:, 1], dets[:, 2], dets[:, 3]
    scores = dets[:, 4]

    areas = (x2 - x1) * (y2 - y1)
    order = scores.argsort()[::-1]  # highest score first

    keep = []
    while order.size > 0:
        best = order[0]
        keep.append(int(best))
        # Intersection of the kept box with every remaining candidate.
        ix1 = np.maximum(x1[best], x1[order[1:]])
        iy1 = np.maximum(y1[best], y1[order[1:]])
        ix2 = np.minimum(x2[best], x2[order[1:]])
        iy2 = np.minimum(y2[best], y2[order[1:]])

        inter = np.maximum(0.0, ix2 - ix1) * np.maximum(0.0, iy2 - iy1)
        overlap = inter / (areas[best] + areas[order[1:]] - inter)

        # Keep only candidates that do not overlap the chosen box too much.
        order = order[1:][overlap <= thresh]

    return np.array(keep).astype(int)
39
+
40
+
41
def decode(loc, priors, variances):
    """Decode locations from predictions using priors to undo
    the encoding we did for offset regression at train time.
    Args:
        loc (tensor): location predictions for loc layers,
            Shape: [num_priors,4]
        priors (tensor): Prior boxes in center-offset form.
            Shape: [num_priors,4].
        variances: (list[float]) Variances of priorboxes
    Return:
        decoded bounding box predictions
    """

    # Apply the variance-scaled offsets: centers shift, sizes scale.
    centers = priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:]
    sizes = priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])
    boxes = torch.cat((centers, sizes), 1)
    # Convert (cx, cy, w, h) to corner form (x1, y1, x2, y2) in place.
    boxes[:, :2] -= boxes[:, 2:] / 2
    boxes[:, 2:] += boxes[:, :2]
    return boxes
60
+
61
+
62
def nms(boxes, scores, overlap=0.5, top_k=200):
    """Apply non-maximum suppression at test time to avoid detecting too many
    overlapping bounding boxes for a given object.

    Args:
        boxes: (tensor) The location preds for the img, Shape: [num_priors,4].
        scores: (tensor) The class predscores for the img, Shape:[num_priors].
        overlap: (float) The overlap thresh for suppressing unnecessary boxes.
        top_k: (int) The Maximum number of box preds to consider.

    Return:
        (keep, count): `keep` is a long tensor of length num_priors whose
        first `count` entries are the indices of the kept boxes (best score
        first); entries past `count` are meaningless (zero-initialised).
    """
    keep = scores.new(scores.size(0)).zero_().long()
    if boxes.numel() == 0:
        return keep, 0

    x1 = boxes[:, 0]
    y1 = boxes[:, 1]
    x2 = boxes[:, 2]
    y2 = boxes[:, 3]
    area = torch.mul(x2 - x1, y2 - y1)

    # Sort ascending and keep the indices of the top-k highest scores at
    # the end, so idx[-1] is always the current best candidate.
    _, idx = scores.sort(0)
    idx = idx[-top_k:]

    count = 0
    while idx.numel() > 0:
        i = idx[-1]  # index of the current largest score
        keep[count] = i
        count += 1
        if idx.size(0) == 1:
            break
        idx = idx[:-1]  # remove the kept element

        # Intersection of the kept box with each remaining candidate.
        # (The original pre-allocated `out=` buffers and `resize_as_` calls
        # were dead/deprecated API; plain index_select + clamp is equivalent.)
        xx1 = torch.index_select(x1, 0, idx).clamp(min=float(x1[i]))
        yy1 = torch.index_select(y1, 0, idx).clamp(min=float(y1[i]))
        xx2 = torch.index_select(x2, 0, idx).clamp(max=float(x2[i]))
        yy2 = torch.index_select(y2, 0, idx).clamp(max=float(y2[i]))
        w = (xx2 - xx1).clamp(min=0.0)
        h = (yy2 - yy1).clamp(min=0.0)
        inter = w * h

        # IoU = i / (area(a) + area(b) - i)
        rem_areas = torch.index_select(area, 0, idx)
        union = (rem_areas - inter) + area[i]
        IoU = inter / union

        # keep only elements with an IoU <= overlap
        idx = idx[IoU.le(overlap)]
    return keep, count
127
+
128
+
129
class Detect(object):
    """Turn raw SSD-style network outputs into final detections.

    Decodes location regressions against the prior boxes, filters priors by
    class confidence, and applies per-class NMS, producing a fixed-size
    output tensor of (score, x1, y1, x2, y2) rows.
    """

    def __init__(self, num_classes=2,
                 top_k=750, nms_thresh=0.3, conf_thresh=0.05,
                 variance=[0.1, 0.2], nms_top_k=5000):
        # num_classes: number of classes including background (index 0).
        # top_k: maximum detections kept per class after NMS.
        # nms_thresh / nms_top_k: IoU threshold and candidate cap for NMS.
        # conf_thresh: minimum class confidence for a prior to be considered.
        # variance: prior-box variances consumed by `decode`.
        self.num_classes = num_classes
        self.top_k = top_k
        self.nms_thresh = nms_thresh
        self.conf_thresh = conf_thresh
        self.variance = variance
        self.nms_top_k = nms_top_k

    def forward(self, loc_data, conf_data, prior_data):
        """Decode and suppress a batch of predictions.

        Args:
            loc_data: (tensor) location predictions, shape [batch, num_priors, 4].
            conf_data: (tensor) class confidences, shape [batch, num_priors, num_classes].
            prior_data: (tensor) prior boxes in center-size form, shape [num_priors, 4].

        Returns:
            tensor of shape [batch, num_classes, top_k, 5]; each kept row is
            (score, x1, y1, x2, y2), zero-padded past the per-class count.
        """
        num = loc_data.size(0)
        num_priors = prior_data.size(0)

        conf_preds = conf_data.view(num, num_priors, self.num_classes).transpose(2, 1)
        # Broadcast the priors across the batch for a single decode call.
        batch_priors = prior_data.view(-1, num_priors, 4).expand(num, num_priors, 4)
        batch_priors = batch_priors.contiguous().view(-1, 4)

        decoded_boxes = decode(loc_data.view(-1, 4), batch_priors, self.variance)
        decoded_boxes = decoded_boxes.view(num, num_priors, 4)

        output = torch.zeros(num, self.num_classes, self.top_k, 5)

        for i in range(num):
            boxes = decoded_boxes[i].clone()
            conf_scores = conf_preds[i].clone()

            for cl in range(1, self.num_classes):  # class 0 is background
                c_mask = conf_scores[cl].gt(self.conf_thresh)
                scores = conf_scores[cl][c_mask]

                # BUGFIX: boolean masking returns a 1-D tensor, so the original
                # `scores.dim() == 0` test never fired; check emptiness instead
                # so NMS is skipped when no prior passes the threshold.
                if scores.numel() == 0:
                    continue
                l_mask = c_mask.unsqueeze(1).expand_as(boxes)
                boxes_ = boxes[l_mask].view(-1, 4)
                ids, count = nms(boxes_, scores, self.nms_thresh, self.nms_top_k)
                count = count if count < self.top_k else self.top_k

                output[i, cl, :count] = torch.cat((scores[ids[:count]].unsqueeze(1), boxes_[ids[:count]]), 1)

        return output
174
+
175
+
176
class PriorBox(object):
    """Generate S3FD anchor (prior) boxes in center-size form.

    Emits one square prior per feature-map cell; the prior for scale k has
    side `min_sizes[k]` expressed relative to the input image dimensions.
    """

    def __init__(self, input_size, feature_maps,
                 variance=[0.1, 0.2],
                 min_sizes=[16, 32, 64, 128, 256, 512],
                 steps=[4, 8, 16, 32, 64, 128],
                 clip=False):
        # input_size: (height, width) of the network input image.
        # feature_maps: list of (height, width) pairs, one per detection scale.
        # steps: backbone stride at each scale; min_sizes: anchor side in pixels.
        super(PriorBox, self).__init__()

        self.imh = input_size[0]
        self.imw = input_size[1]
        self.feature_maps = feature_maps

        self.variance = variance
        self.min_sizes = min_sizes
        self.steps = steps
        self.clip = clip

    def forward(self):
        """Return a [num_priors, 4] float tensor of (cx, cy, w, h) in unit coords."""
        priors = []
        for scale, fmap in enumerate(self.feature_maps):
            rows, cols = fmap[0], fmap[1]
            # Loop-invariant per-scale quantities: effective grid size and
            # the anchor size normalised by the image dimensions.
            cells_w = self.imw / self.steps[scale]
            cells_h = self.imh / self.steps[scale]
            box_w = self.min_sizes[scale] / self.imw
            box_h = self.min_sizes[scale] / self.imh

            for row, col in product(range(rows), range(cols)):
                cx = (col + 0.5) / cells_w
                cy = (row + 0.5) / cells_h
                priors += [cx, cy, box_w, box_h]

        out = torch.FloatTensor(priors).view(-1, 4)

        if self.clip:
            out.clamp_(max=1, min=0)

        return out
talknet-asd/model/faceDetector/s3fd/nets.py ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ import torch.nn.init as init
5
+ from .box_utils import Detect, PriorBox
6
+
7
+
8
class L2Norm(nn.Module):
    """Channel-wise L2 normalisation with a learnable per-channel scale.

    Each spatial position of a (N, C, H, W) input is normalised to unit
    L2 norm across the channel axis, then multiplied by a learned weight
    per channel (initialised to `scale`).
    """

    def __init__(self, n_channels, scale):
        super(L2Norm, self).__init__()
        self.n_channels = n_channels
        self.gamma = scale or None  # initial value for every channel weight
        self.eps = 1e-10            # numerical guard against division by zero
        self.weight = nn.Parameter(torch.Tensor(self.n_channels))
        self.reset_parameters()

    def reset_parameters(self):
        """Fill every channel weight with the initial scale `gamma`."""
        init.constant_(self.weight, self.gamma)

    def forward(self, x):
        # L2 norm over channels at each spatial location, then rescale.
        denom = x.pow(2).sum(dim=1, keepdim=True).sqrt() + self.eps
        normalised = torch.div(x, denom)
        scale = self.weight.unsqueeze(0).unsqueeze(2).unsqueeze(3).expand_as(normalised)
        return scale * normalised
26
+
27
+
28
class S3FDNet(nn.Module):
    """S3FD single-shot face detector.

    A VGG16 backbone (with dilated conv replacements for fc6/fc7) plus two
    extra down-sampling conv pairs yields six feature maps.  Per-map
    location and confidence heads feed prior-box generation, decoding and
    NMS (via `Detect`), returning final (score, box) detections.
    """

    def __init__(self, device='cuda'):
        super(S3FDNet, self).__init__()
        self.device = device

        # VGG16 layers as a flat list; forward() slices this list at fixed
        # indices (16 / 23 / 30), so order and count must not change.
        self.vgg = nn.ModuleList([
            nn.Conv2d(3, 64, 3, 1, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(64, 64, 3, 1, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2),

            nn.Conv2d(64, 128, 3, 1, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(128, 128, 3, 1, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2),

            nn.Conv2d(128, 256, 3, 1, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, 3, 1, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, 3, 1, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2, ceil_mode=True),

            nn.Conv2d(256, 512, 3, 1, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 512, 3, 1, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 512, 3, 1, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2),

            nn.Conv2d(512, 512, 3, 1, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 512, 3, 1, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 512, 3, 1, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2),

            # fc6/fc7 replacements: dilated 3x3 then 1x1 conv.
            nn.Conv2d(512, 1024, 3, 1, padding=6, dilation=6),
            nn.ReLU(inplace=True),
            nn.Conv2d(1024, 1024, 1, 1),
            nn.ReLU(inplace=True),
        ])

        # L2 normalisation on the shallow taps (their activations have much
        # larger magnitudes than the deeper feature maps).
        self.L2Norm3_3 = L2Norm(256, 10)
        self.L2Norm4_3 = L2Norm(512, 8)
        self.L2Norm5_3 = L2Norm(512, 5)

        # Extra down-sampling layers producing the two smallest feature maps.
        self.extras = nn.ModuleList([
            nn.Conv2d(1024, 256, 1, 1),
            nn.Conv2d(256, 512, 3, 2, padding=1),
            nn.Conv2d(512, 128, 1, 1),
            nn.Conv2d(128, 256, 3, 2, padding=1),
        ])

        # One location head (4 box offsets) per detection scale.
        self.loc = nn.ModuleList([
            nn.Conv2d(256, 4, 3, 1, padding=1),
            nn.Conv2d(512, 4, 3, 1, padding=1),
            nn.Conv2d(512, 4, 3, 1, padding=1),
            nn.Conv2d(1024, 4, 3, 1, padding=1),
            nn.Conv2d(512, 4, 3, 1, padding=1),
            nn.Conv2d(256, 4, 3, 1, padding=1),
        ])

        # Confidence heads.  The first emits 4 channels: 3 background
        # candidates (reduced by max in forward()) + 1 face channel.
        self.conf = nn.ModuleList([
            nn.Conv2d(256, 4, 3, 1, padding=1),
            nn.Conv2d(512, 2, 3, 1, padding=1),
            nn.Conv2d(512, 2, 3, 1, padding=1),
            nn.Conv2d(1024, 2, 3, 1, padding=1),
            nn.Conv2d(512, 2, 3, 1, padding=1),
            nn.Conv2d(256, 2, 3, 1, padding=1),
        ])

        self.softmax = nn.Softmax(dim=-1)
        self.detect = Detect()

    def forward(self, x):
        """Run detection on an image batch.

        Args:
            x: (tensor) input images, shape (N, 3, H, W).

        Returns:
            `Detect` output: tensor [N, num_classes, top_k, 5] of
            (score, x1, y1, x2, y2) rows.
        """
        size = x.size()[2:]
        sources = list()
        loc = list()
        conf = list()

        # conv1_1 .. conv3_3 (vgg[0:16]); tap the L2-normalised conv3_3.
        for k in range(16):
            x = self.vgg[k](x)
        s = self.L2Norm3_3(x)
        sources.append(s)

        # conv4_1 .. conv4_3 (vgg[16:23]); tap the L2-normalised conv4_3.
        for k in range(16, 23):
            x = self.vgg[k](x)
        s = self.L2Norm4_3(x)
        sources.append(s)

        # conv5_1 .. conv5_3 (vgg[23:30]); tap the L2-normalised conv5_3.
        for k in range(23, 30):
            x = self.vgg[k](x)
        s = self.L2Norm5_3(x)
        sources.append(s)

        # fc6/fc7 replacement convs; their output is the fourth source.
        for k in range(30, len(self.vgg)):
            x = self.vgg[k](x)
        sources.append(x)

        # apply extra layers and cache source layer outputs
        # (every second extra layer is a stride-2 conv producing a new scale)
        for k, v in enumerate(self.extras):
            x = F.relu(v(x), inplace=True)
            if k % 2 == 1:
                sources.append(x)

        # apply multibox head to source layers
        loc_x = self.loc[0](sources[0])
        conf_x = self.conf[0](sources[0])

        # Max-out background: collapse the 3 background channels of the
        # first conf head into one via element-wise max.
        max_conf, _ = torch.max(conf_x[:, 0:3, :, :], dim=1, keepdim=True)
        conf_x = torch.cat((max_conf, conf_x[:, 3:, :, :]), dim=1)

        loc.append(loc_x.permute(0, 2, 3, 1).contiguous())
        conf.append(conf_x.permute(0, 2, 3, 1).contiguous())

        for i in range(1, len(sources)):
            x = sources[i]
            conf.append(self.conf[i](x).permute(0, 2, 3, 1).contiguous())
            loc.append(self.loc[i](x).permute(0, 2, 3, 1).contiguous())

        # Record each scale's (H, W) so PriorBox can match the head outputs.
        features_maps = []
        for i in range(len(loc)):
            feat = []
            feat += [loc[i].size(1), loc[i].size(2)]
            features_maps += [feat]

        loc = torch.cat([o.view(o.size(0), -1) for o in loc], 1)
        conf = torch.cat([o.view(o.size(0), -1) for o in conf], 1)

        # Priors depend on the input size, so they are rebuilt every forward.
        with torch.no_grad():
            self.priorbox = PriorBox(size, features_maps)
            self.priors = self.priorbox.forward()

        # NOTE(review): `type(type(x.data))` is a legacy tensor-type cast;
        # modern code would use `.to(x.dtype)` — confirm target torch version.
        output = self.detect.forward(
            loc.view(loc.size(0), -1, 4),
            self.softmax(conf.view(conf.size(0), -1, 2)),
            self.priors.type(type(x.data)).to(self.device)
        )

        return output
talknet-asd/model/talkNetModel.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+ from model.audioEncoder import audioEncoder
5
+ from model.visualEncoder import visualFrontend, visualTCN, visualConv1D
6
+ from model.attentionLayer import attentionLayer
7
+
8
class talkNetModel(nn.Module):
    """TalkNet audio-visual active-speaker backbone.

    Encodes the audio and visual streams into 128-dim per-frame embeddings,
    fuses them with bidirectional cross-attention, then combines the
    concatenated streams with self-attention.
    """

    def __init__(self):
        super(talkNetModel, self).__init__()
        # Visual Temporal Encoder
        self.visualFrontend = visualFrontend() # Visual Frontend
        # self.visualFrontend.load_state_dict(torch.load('visual_frontend.pt', map_location="cuda"))
        # for param in self.visualFrontend.parameters():
        #     param.requires_grad = False
        self.visualTCN = visualTCN() # Visual Temporal Network TCN
        self.visualConv1D = visualConv1D() # Visual Temporal Network Conv1d

        # Audio Temporal Encoder
        self.audioEncoder = audioEncoder(layers = [3, 4, 6, 3], num_filters = [16, 32, 64, 128])

        # Audio-visual Cross Attention (each stream attends to the other)
        self.crossA2V = attentionLayer(d_model = 128, nhead = 8)
        self.crossV2A = attentionLayer(d_model = 128, nhead = 8)

        # Audio-visual Self Attention over the concatenated 256-dim stream
        self.selfAV = attentionLayer(d_model = 256, nhead = 8)

    def forward_visual_frontend(self, x):
        """Encode a (B, T, W, H) grayscale frame batch into (B, T, 128) features."""
        B, T, W, H = x.shape
        x = x.view(B*T, 1, 1, W, H)
        # Normalise pixels; 0.4161 / 0.1688 are presumably the dataset
        # mean / std of the grayscale mouth crops — TODO confirm.
        x = (x / 255 - 0.4161) / 0.1688
        x = self.visualFrontend(x)
        x = x.view(B, T, 512)
        # Conv1d modules expect (B, C, T); transpose back afterwards.
        x = x.transpose(1,2)
        x = self.visualTCN(x)
        x = self.visualConv1D(x)
        x = x.transpose(1,2)
        return x

    def forward_audio_frontend(self, x):
        """Encode an audio feature batch via the ResNet-style audio encoder."""
        # Add a channel axis and swap the last two dims for the 2-D encoder.
        x = x.unsqueeze(1).transpose(2, 3)
        x = self.audioEncoder(x)
        return x

    def forward_cross_attention(self, x1, x2):
        """Cross-attend audio and visual embeddings to each other."""
        x1_c = self.crossA2V(src = x1, tar = x2)
        x2_c = self.crossV2A(src = x2, tar = x1)
        return x1_c, x2_c

    def forward_audio_visual_backend(self, x1, x2):
        """Fuse both streams with self-attention; returns (B*T, 256)."""
        x = torch.cat((x1,x2), 2)
        x = self.selfAV(src = x, tar = x)
        # Flatten batch and time for the frame-level classifier heads.
        x = torch.reshape(x, (-1, 256))
        return x

    def forward_audio_backend(self,x):
        """Flatten the audio stream to (B*T, 128) for the audio-only head."""
        x = torch.reshape(x, (-1, 128))
        return x

    def forward_visual_backend(self,x):
        """Flatten the visual stream to (B*T, 128) for the visual-only head."""
        x = torch.reshape(x, (-1, 128))
        return x
+ return x
64
+
talknet-asd/model/visualEncoder.py ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ##
2
+ # ResNet18 Pretrained network to extract lip embedding
3
+ # This code is modified based on https://github.com/lordmartian/deep_avsr
4
+ ##
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+
10
+
11
class ResNetLayer(nn.Module):

    """
    A ResNet layer used to build the ResNet network.
    Architecture:
    --> conv-bn-relu -> conv -> + -> bn-relu -> conv-bn-relu -> conv -> + -> bn-relu -->
                 |              |           |                          |
                 -> downsample ->           ---------------------------
    (i.e. two residual sub-blocks; only the first may change stride/width)
    """

    def __init__(self, inplanes, outplanes, stride):
        # inplanes/outplanes: input/output channel counts.
        # stride: spatial stride of the first sub-block; when != 1 the
        # residual path goes through the 1x1 `downsample` conv instead.
        super(ResNetLayer, self).__init__()
        self.conv1a = nn.Conv2d(inplanes, outplanes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1a = nn.BatchNorm2d(outplanes, momentum=0.01, eps=0.001)
        self.conv2a = nn.Conv2d(outplanes, outplanes, kernel_size=3, stride=1, padding=1, bias=False)
        self.stride = stride
        self.downsample = nn.Conv2d(inplanes, outplanes, kernel_size=(1,1), stride=stride, bias=False)
        self.outbna = nn.BatchNorm2d(outplanes, momentum=0.01, eps=0.001)

        self.conv1b = nn.Conv2d(outplanes, outplanes, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1b = nn.BatchNorm2d(outplanes, momentum=0.01, eps=0.001)
        self.conv2b = nn.Conv2d(outplanes, outplanes, kernel_size=3, stride=1, padding=1, bias=False)
        self.outbnb = nn.BatchNorm2d(outplanes, momentum=0.01, eps=0.001)
        return


    def forward(self, inputBatch):
        """Run both residual sub-blocks; returns a (N, outplanes, H', W') batch."""
        # Sub-block A: conv-bn-relu -> conv, with identity or 1x1 downsample
        # residual depending on stride.
        batch = F.relu(self.bn1a(self.conv1a(inputBatch)))
        batch = self.conv2a(batch)
        if self.stride == 1:
            residualBatch = inputBatch
        else:
            residualBatch = self.downsample(inputBatch)
        batch = batch + residualBatch
        intermediateBatch = batch
        batch = F.relu(self.outbna(batch))

        # Sub-block B: always stride 1 with an identity residual
        # (the pre-activation output of sub-block A).
        batch = F.relu(self.bn1b(self.conv1b(batch)))
        batch = self.conv2b(batch)
        residualBatch = intermediateBatch
        batch = batch + residualBatch
        outputBatch = F.relu(self.outbnb(batch))
        return outputBatch
54
+
55
+
56
+
57
class ResNet(nn.Module):

    """
    An 18-layer ResNet architecture: four ResNetLayer stages followed by
    a 4x4 average pool.  Channel width doubles while spatial resolution
    halves at each stage after the first.
    """

    def __init__(self):
        super(ResNet, self).__init__()
        self.layer1 = ResNetLayer(64, 64, stride=1)
        self.layer2 = ResNetLayer(64, 128, stride=2)
        self.layer3 = ResNetLayer(128, 256, stride=2)
        self.layer4 = ResNetLayer(256, 512, stride=2)
        self.avgpool = nn.AvgPool2d(kernel_size=(4, 4), stride=(1, 1))

    def forward(self, inputBatch):
        """Run the four residual stages, then average-pool the result."""
        batch = inputBatch
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            batch = stage(batch)
        return self.avgpool(batch)
81
+
82
+
83
class GlobalLayerNorm(nn.Module):
    """Global layer normalisation over channel and time axes.

    Each sample of an [M, N, K] batch is normalised by its global mean and
    variance (over dims 1 and 2), then scaled and shifted by learnable
    per-channel parameters.
    """

    def __init__(self, channel_size):
        super(GlobalLayerNorm, self).__init__()
        self.gamma = nn.Parameter(torch.Tensor(1, channel_size, 1))  # [1, N, 1] gain
        self.beta = nn.Parameter(torch.Tensor(1, channel_size, 1))   # [1, N, 1] bias
        self.reset_parameters()

    def reset_parameters(self):
        """Start as the identity transform: gain 1, bias 0."""
        self.gamma.data.fill_(1)
        self.beta.data.zero_()

    def forward(self, y):
        # Per-sample statistics over both channel (dim 1) and time (dim 2).
        mu = y.mean(dim=1, keepdim=True).mean(dim=2, keepdim=True)  # [M, 1, 1]
        var = (torch.pow(y - mu, 2)).mean(dim=1, keepdim=True).mean(dim=2, keepdim=True)
        # 1e-8 guards against division by zero for constant inputs.
        return self.gamma * (y - mu) / torch.pow(var + 1e-8, 0.5) + self.beta
99
+
100
class visualFrontend(nn.Module):

    """
    A visual feature extraction module. Generates a 512-dim feature vector per video frame.
    Architecture: A 3D convolution block followed by an 18-layer ResNet.
    """

    def __init__(self):
        super(visualFrontend, self).__init__()
        # 3-D front-end: temporal kernel 5 (stride 1), spatial stride 2
        # convolution plus spatial-only max-pool.
        self.frontend3D = nn.Sequential(
            nn.Conv3d(1, 64, kernel_size=(5,7,7), stride=(1,2,2), padding=(2,3,3), bias=False),
            nn.BatchNorm3d(64, momentum=0.01, eps=0.001),
            nn.ReLU(),
            nn.MaxPool3d(kernel_size=(1,3,3), stride=(1,2,2), padding=(0,1,1))
        )
        self.resnet = ResNet()
        return


    def forward(self, inputBatch):
        """Extract per-frame 512-dim features from a video batch."""
        # Move axes so Conv3d sees an (N, C, T, H, W) layout; the exact
        # input layout depends on the caller — confirm against
        # talkNetModel.forward_visual_frontend's (B*T, 1, 1, W, H) view.
        inputBatch = inputBatch.transpose(0, 1).transpose(1, 2)
        batchsize = inputBatch.shape[0]
        batch = self.frontend3D(inputBatch)

        # Fold the time axis into the batch axis so every frame goes
        # through the 2-D ResNet independently.
        batch = batch.transpose(1, 2)
        batch = batch.reshape(batch.shape[0]*batch.shape[1], batch.shape[2], batch.shape[3], batch.shape[4])
        outputBatch = self.resnet(batch)
        outputBatch = outputBatch.reshape(batchsize, -1, 512)
        # NOTE(review): the next two (1,2) transposes cancel each other;
        # the net effect is only the final (0,1) transpose.
        outputBatch = outputBatch.transpose(1 ,2)
        outputBatch = outputBatch.transpose(1, 2).transpose(0, 1)
        return outputBatch
131
+
132
class DSConv1d(nn.Module):
    """Depthwise-separable 1-D convolution block with a residual skip.

    ReLU + BatchNorm, a per-channel (grouped) 3-tap conv, PReLU, global
    layer norm, then a pointwise conv; the input is added back at the end.
    """

    def __init__(self):
        super(DSConv1d, self).__init__()
        self.net = nn.Sequential(
            nn.ReLU(),
            nn.BatchNorm1d(512),
            # Depthwise: one 3-tap filter per channel (groups=512).
            nn.Conv1d(512, 512, 3, stride=1, padding=1, dilation=1, groups=512, bias=False),
            nn.PReLU(),
            GlobalLayerNorm(512),
            # Pointwise: mix information across channels.
            nn.Conv1d(512, 512, 1, bias=False),
        )

    def forward(self, x):
        # Residual connection around the separable-conv block.
        return self.net(x) + x
147
+
148
class visualTCN(nn.Module):
    """Visual Temporal Network (V-TCN): five stacked DSConv1d blocks."""

    def __init__(self):
        super(visualTCN, self).__init__()
        # Each DSConv1d is residual, so the stack preserves the 512-channel shape.
        self.net = nn.Sequential(*[DSConv1d() for _ in range(5)])

    def forward(self, x):
        """Apply the temporal stack to a (B, 512, T) feature batch."""
        return self.net(x)
159
+
160
class visualConv1D(nn.Module):
    """Project 512-dim visual features down to 128 dims along time."""

    def __init__(self):
        super(visualConv1D, self).__init__()
        self.net = nn.Sequential(
            # Temporal context (kernel 5) with channel reduction 512 -> 256.
            nn.Conv1d(512, 256, 5, stride=1, padding=2),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            # Pointwise projection 256 -> 128.
            nn.Conv1d(256, 128, 1),
        )

    def forward(self, x):
        """Map a (B, 512, T) batch to (B, 128, T)."""
        return self.net(x)
talknet-asd/predict.py ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import cv2
3
+ import json
4
+ import glob
5
+ import pickle
6
+ import shutil
7
+ import subprocess
8
+ from typing import List, Optional
9
+ from cog import BasePredictor, BaseModel, Input, Path
10
+
11
+
12
class Output(BaseModel):
    """Prediction result container.

    Exactly one field is populated, depending on the `return_json`
    flag passed to `Predictor.predict`.
    """

    # Paths to the converted .mp4 visualisation files (return_json=False).
    media_path: Optional[List[Path]]
    # JSON-encoded per-frame face/speaking data (return_json=True).
    json_str: Optional[str]
15
+
16
+
17
class Predictor(BasePredictor):
    """Cog predictor wrapping the TalkNet active-speaker-detection demo.

    Runs `demoTalkNet.py` as a subprocess on the uploaded video, then
    either converts the rendered .avi outputs to .mp4 or reshapes the
    pickled tracks/scores into a per-frame JSON structure.
    """

    def setup(self):
        # No model loading here; demoTalkNet.py loads everything itself.
        pass

    def predict(
        self,
        video: Path = Input(description="Path to the video"),
        face_det_scale: float = Input(
            default=0.25,
            description="Scale factor for face detection, the frames will be scaled to 0.25 of the original",
            ge=0,
            le=1,
        ),
        min_track: int = Input(
            default=10, description="Number of min frames for each shot"
        ),
        num_failed_det: int = Input(
            default=10,
            description="Number of missed detections allowed before tracking is stopped",
            ge=1,
        ),
        min_face_size: int = Input(
            default=1, description="Minimum face size in pixels", ge=1
        ),
        crop_scale: float = Input(
            default=0.40, description="Scale bounding box", ge=0, le=1
        ),
        start: int = Input(default=0, description="The start time of the video", ge=0),
        duration: int = Input(
            default=-1,
            description="The duration of the video, when set as -1, will extract the whole video",
        ),
        return_json: bool = Input(
            description="Return results in json format", default=True
        ),
        return_boundingbox_percentages: bool = Input(
            description="Return bounding box coordinates as percentages of the video width and height",
            default=False,
        ),
    ) -> Output:

        video_path = str(video)
        video_name = os.path.splitext(os.path.basename(video_path))[0]
        video_folder = "demo"

        # Clean up and create the video folder
        shutil.rmtree(video_folder, ignore_errors=True)
        os.makedirs(video_folder, exist_ok=True)

        # Copy the input video to the video folder
        target_video_path = os.path.join(video_folder, os.path.basename(video_path))
        shutil.copy(video_path, target_video_path)

        # NOTE(review): this maps the documented -1 sentinel to 0; it relies
        # on demoTalkNet treating duration 0 as "whole video" — confirm.
        duration = max(0, duration)
        n_data_loader_thread = 32

        # Run the demoTalkNet.py script with the provided arguments
        command = (
            f"python demoTalkNet.py --videoName {video_name} "
            f"--videoFolder {video_folder} "
            f"--pretrainModel pretrain_TalkSet.model "
            f"--nDataLoaderThread {n_data_loader_thread} "
            f"--facedetScale {face_det_scale} "
            f"--minTrack {min_track} "
            f"--numFailedDet {num_failed_det} "
            f"--minFaceSize {min_face_size} "
            f"--cropScale {crop_scale} "
            f"--start {start} "
            f"--duration {duration} "
        )

        # NOTE(review): shell=True with an unquoted interpolated video name
        # breaks on spaces and is shell-injection-prone; prefer an argument
        # list with shell=False.
        process = subprocess.Popen(
            command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )
        stdout, stderr = process.communicate()
        print(f"Command output: {stdout.decode()}")
        if stderr:
            print(f"Command errors: {stderr.decode()}")

        # Find the most recent pywork folder
        pywork_folders = glob.glob(os.path.join(video_folder, "*", "pywork"))
        latest_pywork_folder = max(pywork_folders, key=os.path.getctime)

        # Load the face tracks and scores from the pickle files generated by demoTalkNet.py
        tracks_file = os.path.join(latest_pywork_folder, "tracks.pckl")
        scores_file = os.path.join(latest_pywork_folder, "scores.pckl")
        with open(tracks_file, "rb") as f:
            face_tracks = pickle.load(f)  # list
        with open(scores_file, "rb") as f:
            scores = pickle.load(f)  # list

        # Get the video dimensions
        # (note: rebinding `video` shadows the input parameter from here on)
        video = cv2.VideoCapture(target_video_path)
        video_width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
        video_height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))
        video.release()

        # Convert face tracks and scores to the desired JSON format
        output_data = []
        for track_idx, track in enumerate(face_tracks):
            # Get the frame numbers for the current track
            frames = track["track"]["frame"]

            # Get the bounding box information for the current track
            # (presumably per-frame center x/y and half-size s — confirm
            # against demoTalkNet's crop format)
            boxes = track["proc_track"]

            # Get the speaking scores for the current track
            # If the track index is out of range, use an empty list
            speaking_scores = scores[track_idx] if track_idx < len(scores) else []

            for i, frame in enumerate(frames):
                # Check if the current index is within the valid range of the bounding box information
                # If not, break the loop and move to the next track
                if i >= len(boxes["x"]) or i >= len(boxes["y"]) or i >= len(boxes["s"]):
                    break

                # Calculate bounding box coordinates
                x0 = int(boxes["x"][i] - boxes["s"][i])
                y0 = int(boxes["y"][i] - boxes["s"][i])
                x1 = int(boxes["x"][i] + boxes["s"][i])
                y1 = int(boxes["y"][i] + boxes["s"][i])

                # Normalize the bounding box coordinates if required
                if return_boundingbox_percentages:
                    x0 /= video_width
                    y0 /= video_height
                    x1 /= video_width
                    y1 /= video_height

                # Determine speaking status (non-negative score => speaking)
                speaking = (
                    bool(speaking_scores[i] >= 0) if i < len(speaking_scores) else False
                )

                # Create the bounding box dictionary
                box = {
                    "face_id": track_idx,
                    "x0": x0,
                    "y0": y0,
                    "x1": x1,
                    "y1": y1,
                    "speaking": speaking,
                }

                # Create a dictionary for each frame if it doesn't exist
                # (linear scan per frame: O(frames^2) overall, acceptable for
                # short clips)
                frame_data = next(
                    (
                        data
                        for data in output_data
                        if data["frame_number"] == int(frame)
                    ),
                    None,
                )
                if frame_data is None:
                    frame_data = {"frame_number": int(frame), "faces": []}
                    output_data.append(frame_data)

                # Add the current face's bounding box and speaking status to the frame's data
                frame_data["faces"].append(box)

        # Convert the output data to JSON string
        json_str = json.dumps(output_data)

        if return_json:
            return Output(json_str=json_str)
        else:
            # Convert every rendered .avi (except the intermediates) to .mp4.
            mp4_files = []
            excluded_files = ["video_only.avi", "video.avi"]
            avi_files = [
                avi_file
                for avi_file in Path(video_folder).rglob("*.avi")
                if avi_file.name not in excluded_files
            ]
            for avi_file in avi_files:
                mp4_file = avi_file.with_suffix(".mp4")
                conversion_command = f"ffmpeg -i {avi_file} {mp4_file}"
                conversion_process = subprocess.run(
                    conversion_command,
                    shell=True,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE,
                )
                # Only report files that were actually produced.
                if conversion_process.returncode == 0:
                    mp4_files.append(Path(mp4_file))
            return Output(media_path=mp4_files)
talknet-asd/sanity_check.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
talknet-asd/talkNet.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+
5
+ import sys, time, numpy, os, subprocess, pandas, tqdm
6
+
7
+ from loss import lossAV, lossA, lossV
8
+ from model.talkNetModel import talkNetModel
9
+
10
class talkNet(nn.Module):
    """Training/evaluation wrapper around `talkNetModel`.

    Bundles the model with its three losses (audio-visual, audio-only,
    visual-only), an Adam optimiser and a per-epoch LR decay schedule.
    All modules are placed on CUDA.
    """

    def __init__(self, lr = 0.0001, lrDecay = 0.95, **kwargs):
        # lr: initial Adam learning rate; lrDecay: multiplicative decay per epoch.
        super(talkNet, self).__init__()
        self.model = talkNetModel().cuda()
        self.lossAV = lossAV().cuda()
        self.lossA = lossA().cuda()
        self.lossV = lossV().cuda()
        self.optim = torch.optim.Adam(self.parameters(), lr = lr)
        self.scheduler = torch.optim.lr_scheduler.StepLR(self.optim, step_size = 1, gamma=lrDecay)
        # Parameter count reported in "millions" (1024*1024 divisor).
        print(time.strftime("%m-%d %H:%M:%S") + " Model para number = %.2f"%(sum(param.numel() for param in self.model.parameters()) / 1024 / 1024))

    def train_network(self, loader, epoch, **kwargs):
        """Run one training epoch; returns (mean loss, learning rate)."""
        self.train()
        # NOTE(review): passing an epoch to scheduler.step() is deprecated in
        # modern PyTorch — confirm the targeted torch version.
        self.scheduler.step(epoch - 1)
        index, top1, loss = 0, 0, 0
        lr = self.optim.param_groups[0]['lr']
        for num, (audioFeature, visualFeature, labels) in enumerate(loader, start=1):
            self.zero_grad()
            audioEmbed = self.model.forward_audio_frontend(audioFeature[0].cuda()) # feedForward
            visualEmbed = self.model.forward_visual_frontend(visualFeature[0].cuda())
            audioEmbed, visualEmbed = self.model.forward_cross_attention(audioEmbed, visualEmbed)
            outsAV= self.model.forward_audio_visual_backend(audioEmbed, visualEmbed)
            outsA = self.model.forward_audio_backend(audioEmbed)
            outsV = self.model.forward_visual_backend(visualEmbed)
            labels = labels[0].reshape((-1)).cuda() # Loss
            nlossAV, _, _, prec = self.lossAV.forward(outsAV, labels)
            nlossA = self.lossA.forward(outsA, labels)
            nlossV = self.lossV.forward(outsV, labels)
            # Weighted sum: AV loss dominates, single-modality losses regularise.
            nloss = nlossAV + 0.4 * nlossA + 0.4 * nlossV
            loss += nloss.detach().cpu().numpy()
            top1 += prec
            nloss.backward()
            self.optim.step()
            index += len(labels)
            # In-place progress line on stderr (carriage return, no newline).
            sys.stderr.write(time.strftime("%m-%d %H:%M:%S") + \
            " [%2d] Lr: %5f, Training: %.2f%%, "    %(epoch, lr, 100 * (num / loader.__len__())) + \
            " Loss: %.5f, ACC: %2.2f%% \r"        %(loss/(num), 100 * (top1/index)))
            sys.stderr.flush()
        sys.stdout.write("\n")
        return loss/num, lr

    def evaluate_network(self, loader, evalCsvSave, evalOrig, **kwargs):
        """Score the loader, write an AVA-format CSV, and return the mAP."""
        self.eval()
        predScores = []
        for audioFeature, visualFeature, labels in tqdm.tqdm(loader):
            with torch.no_grad():
                audioEmbed = self.model.forward_audio_frontend(audioFeature[0].cuda())
                visualEmbed = self.model.forward_visual_frontend(visualFeature[0].cuda())
                audioEmbed, visualEmbed = self.model.forward_cross_attention(audioEmbed, visualEmbed)
                outsAV= self.model.forward_audio_visual_backend(audioEmbed, visualEmbed)
                labels = labels[0].reshape((-1)).cuda()
                _, predScore, _, _ = self.lossAV.forward(outsAV, labels)
                # Column 1 is presumably the "speaking" class probability — confirm.
                predScore = predScore[:,1].detach().cpu().numpy()
                predScores.extend(predScore)
        evalLines = open(evalOrig).read().splitlines()[1:]
        labels = []
        labels = pandas.Series( ['SPEAKING_AUDIBLE' for line in evalLines])
        scores = pandas.Series(predScores)
        evalRes = pandas.read_csv(evalOrig)
        evalRes['score'] = scores
        evalRes['label'] = labels
        evalRes.drop(['label_id'], axis=1,inplace=True)
        evalRes.drop(['instance_id'], axis=1,inplace=True)
        evalRes.to_csv(evalCsvSave, index=False)
        # Parse the mAP out of the official AVA scorer's stdout.
        cmd = "python -O utils/get_ava_active_speaker_performance.py -g %s -p %s "%(evalOrig, evalCsvSave)
        mAP = float(str(subprocess.run(cmd, shell=True, capture_output =True).stdout).split(' ')[2][:5])
        return mAP

    def saveParameters(self, path):
        """Serialise the full wrapper state dict to `path`."""
        torch.save(self.state_dict(), path)

    def loadParameters(self, path):
        """Load a checkpoint, tolerating `module.` prefixes and skipping
        missing or shape-mismatched parameters (with a warning)."""
        selfState = self.state_dict()
        loadedState = torch.load(path)
        for name, param in loadedState.items():
            origName = name;
            if name not in selfState:
                # Strip DataParallel's "module." prefix and retry.
                name = name.replace("module.", "")
                if name not in selfState:
                    print("%s is not in the model."%origName)
                    continue
            if selfState[name].size() != loadedState[origName].size():
                sys.stderr.write("Wrong parameter length: %s, model: %s, loaded: %s"%(origName, selfState[name].size(), loadedState[origName].size()))
                continue
            selfState[name].copy_(param)