DimasMP3 commited on
Commit
5c69097
·
verified ·
1 Parent(s): deba98e

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +7 -0
  2. facelandmarker/face_landmarker.task +3 -0
  3. insightface/.gitattributes +30 -0
  4. insightface/README.md +22 -0
  5. insightface/models/arcface_r100_v1/model-0000.params +3 -0
  6. insightface/models/arcface_r100_v1/model-symbol.json +0 -0
  7. insightface/models/buffalo_l/1k3d68.onnx +3 -0
  8. insightface/models/buffalo_l/2d106det.onnx +3 -0
  9. insightface/models/buffalo_l/det_10g.onnx +3 -0
  10. insightface/models/buffalo_l/genderage.onnx +3 -0
  11. insightface/models/buffalo_l/w600k_r50.onnx +3 -0
  12. insightface/models/genderage_v1/model-0000.params +3 -0
  13. insightface/models/genderage_v1/model-symbol.json +2399 -0
  14. insightface/models/retinaface_r50_v1/R50-0000.params +3 -0
  15. insightface/models/retinaface_r50_v1/R50-symbol.json +0 -0
  16. insightface/models/scrfd_10g/model.pth +3 -0
  17. insightface/models/scrfd_10g_bnkps/model.pth +3 -0
  18. insightface/models/scrfd_1g/model.pth +3 -0
  19. insightface/models/scrfd_2.5g/model.pth +3 -0
  20. insightface/models/scrfd_2.5g_bnkps/model.pth +3 -0
  21. insightface/models/scrfd_34g/model.pth +3 -0
  22. insightface/models/scrfd_500m/model.pth +3 -0
  23. insightface/models/scrfd_500m_bnkps/model.pth +3 -0
  24. insightface/models/scrfd_person_2.5g.onnx +3 -0
  25. insightface/models/synthetic_resnet50d.ckpt +3 -0
  26. talknet-asd/.dockerignore +20 -0
  27. talknet-asd/.gitignore +118 -0
  28. talknet-asd/FAQ.md +54 -0
  29. talknet-asd/LICENSE.md +21 -0
  30. talknet-asd/README.md +146 -0
  31. talknet-asd/TalkSet/README.md +48 -0
  32. talknet-asd/TalkSet/generate_TalkSet.py +391 -0
  33. talknet-asd/awesomeASD.md +38 -0
  34. talknet-asd/cog.yaml +40 -0
  35. talknet-asd/dataLoader.py +143 -0
  36. talknet-asd/demoTalkNet.py +686 -0
  37. talknet-asd/export_onnx_cpu.py +87 -0
  38. talknet-asd/loss.py +50 -0
  39. talknet-asd/model/attentionLayer.py +36 -0
  40. talknet-asd/model/audioEncoder.py +108 -0
  41. talknet-asd/model/faceDetector/README.md +3 -0
  42. talknet-asd/model/faceDetector/__init__.py +1 -0
  43. talknet-asd/model/faceDetector/s3fd/__init__.py +66 -0
  44. talknet-asd/model/faceDetector/s3fd/box_utils.py +217 -0
  45. talknet-asd/model/faceDetector/s3fd/nets.py +174 -0
  46. talknet-asd/model/talkNetModel.py +64 -0
  47. talknet-asd/model/visualEncoder.py +172 -0
  48. talknet-asd/predict.py +201 -0
  49. talknet-asd/sanity_check.ipynb +0 -0
  50. talknet-asd/talkNet.py +94 -0
.gitattributes CHANGED
@@ -33,3 +33,10 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ facelandmarker/face_landmarker.task filter=lfs diff=lfs merge=lfs -text
37
+ insightface/models/arcface_r100_v1/model-0000.params filter=lfs diff=lfs merge=lfs -text
38
+ insightface/models/genderage_v1/model-0000.params filter=lfs diff=lfs merge=lfs -text
39
+ insightface/models/retinaface_r50_v1/R50-0000.params filter=lfs diff=lfs merge=lfs -text
40
+ talknet-asd/utils/overall.png filter=lfs diff=lfs merge=lfs -text
41
+ yolo-face-person-detector/images/image.png filter=lfs diff=lfs merge=lfs -text
42
+ yolo-face-person-detector/images/output.mp4 filter=lfs diff=lfs merge=lfs -text
facelandmarker/face_landmarker.task ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:64184e229b263107bc2b804c6625db1341ff2bb731874b0bcc2fe6544e0bc9ff
3
+ size 3758596
insightface/.gitattributes ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.params filter=lfs diff=lfs merge=lfs -text
2
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
3
+ *.7z filter=lfs diff=lfs merge=lfs -text
4
+ *.arrow filter=lfs diff=lfs merge=lfs -text
5
+ *.bin filter=lfs diff=lfs merge=lfs -text
6
+ *.bin.* filter=lfs diff=lfs merge=lfs -text
7
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
8
+ *.ftz filter=lfs diff=lfs merge=lfs -text
9
+ *.gz filter=lfs diff=lfs merge=lfs -text
10
+ *.h5 filter=lfs diff=lfs merge=lfs -text
11
+ *.joblib filter=lfs diff=lfs merge=lfs -text
12
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
13
+ *.model filter=lfs diff=lfs merge=lfs -text
14
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
15
+ *.onnx filter=lfs diff=lfs merge=lfs -text
16
+ *.ot filter=lfs diff=lfs merge=lfs -text
17
+ *.parquet filter=lfs diff=lfs merge=lfs -text
18
+ *.pb filter=lfs diff=lfs merge=lfs -text
19
+ *.pt filter=lfs diff=lfs merge=lfs -text
20
+ *.pth filter=lfs diff=lfs merge=lfs -text
21
+ *.rar filter=lfs diff=lfs merge=lfs -text
22
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
23
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
24
+ *.tflite filter=lfs diff=lfs merge=lfs -text
25
+ *.tgz filter=lfs diff=lfs merge=lfs -text
26
+ *.wasm filter=lfs diff=lfs merge=lfs -text
27
+ *.xz filter=lfs diff=lfs merge=lfs -text
28
+ *.zip filter=lfs diff=lfs merge=lfs -text
29
+ *.zstandard filter=lfs diff=lfs merge=lfs -text
30
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
insightface/README.md ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # insightface
2
+
3
+ - https://github.com/deepinsight/insightface
4
+ - SCRFD
5
+ - https://github.com/deepinsight/insightface/tree/master/detection/scrfd
6
+ - https://1drv.ms/u/s!AswpsDO2toNKqyYWxScdiTITY4TQ?e=DjXof9
7
+ - https://1drv.ms/u/s!AswpsDO2toNKqyPVLI44ahNBsOMR?e=esPrBL
8
+ - https://1drv.ms/u/s!AswpsDO2toNKqyTIXnzB1ujPq4th?e=5t1VNv
9
+ - https://1drv.ms/u/s!AswpsDO2toNKqyUKwTiwXv2kaa8o?e=umfepO
10
+ - https://1drv.ms/u/s!AswpsDO2toNKqyKZwFebVlmlOvzz?e=V2rqUy
11
+ - https://1drv.ms/u/s!AswpsDO2toNKri_NDM0GIkPpkE2f?e=JkebJo
12
+ - https://1drv.ms/u/s!AswpsDO2toNKqyGlhxnCg3smyQqX?e=A6Hufm
13
+ - https://1drv.ms/u/s!AswpsDO2toNKqyGlhxnCg3smyQqX?e=A6Hufm
14
+ - Person Detection
15
+ - https://github.com/deepinsight/insightface/tree/master/examples/person_detection
16
+ - https://github.com/deepinsight/insightface/releases/download/v0.7/scrfd_person_2.5g.onnx
17
+ - Face Alignment (FaceSynthetics)
18
+ - https://github.com/deepinsight/insightface/tree/master/alignment/synthetics
19
+ - https://drive.google.com/file/d/1kNP7qEl3AYNbaHFUg_ZiyRB1CtfDWXR4/view?usp=sharing
20
+ - buffalo_l
21
+ - https://github.com/deepinsight/insightface/releases/download/v0.7/buffalo_l.zip
22
+
insightface/models/arcface_r100_v1/model-0000.params ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:931257c0b7174254fd81314706f2591cc6d1dd7299275bb8cf01c774ed0da8be
3
+ size 260958682
insightface/models/arcface_r100_v1/model-symbol.json ADDED
The diff for this file is too large to render. See raw diff
 
insightface/models/buffalo_l/1k3d68.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df5c06b8a0c12e422b2ed8947b8869faa4105387f199c477af038aa01f9a45cc
3
+ size 143607619
insightface/models/buffalo_l/2d106det.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f001b856447c413801ef5c42091ed0cd516fcd21f2d6b79635b1e733a7109dbf
3
+ size 5030888
insightface/models/buffalo_l/det_10g.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5838f7fe053675b1c7a08b633df49e7af5495cee0493c7dcf6697200b85b5b91
3
+ size 16923827
insightface/models/buffalo_l/genderage.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4fde69b1c810857b88c64a335084f1c3fe8f01246c9a191b48c7bb756d6652fb
3
+ size 1322532
insightface/models/buffalo_l/w600k_r50.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4c06341c33c2ca1f86781dab0e829f88ad5b64be9fba56e56bc9ebdefc619e43
3
+ size 174383860
insightface/models/genderage_v1/model-0000.params ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:01788b7eaa2516636cbd976fad7883164aaeb0bd4027e878ff457f79fe9021aa
3
+ size 1100856
insightface/models/genderage_v1/model-symbol.json ADDED
@@ -0,0 +1,2399 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "nodes": [
3
+ {
4
+ "op": "null",
5
+ "name": "data",
6
+ "inputs": []
7
+ },
8
+ {
9
+ "op": "_minus_scalar",
10
+ "name": "_minusscalar0",
11
+ "attrs": {"scalar": "127.5"},
12
+ "inputs": [[0, 0, 0]]
13
+ },
14
+ {
15
+ "op": "_mul_scalar",
16
+ "name": "_mulscalar0",
17
+ "attrs": {"scalar": "0.0078125"},
18
+ "inputs": [[1, 0, 0]]
19
+ },
20
+ {
21
+ "op": "null",
22
+ "name": "conv_1_conv2d_weight",
23
+ "attrs": {
24
+ "kernel": "(3, 3)",
25
+ "no_bias": "True",
26
+ "num_filter": "8",
27
+ "num_group": "1",
28
+ "pad": "(1, 1)",
29
+ "stride": "(1, 1)"
30
+ },
31
+ "inputs": []
32
+ },
33
+ {
34
+ "op": "Convolution",
35
+ "name": "conv_1_conv2d",
36
+ "attrs": {
37
+ "kernel": "(3, 3)",
38
+ "no_bias": "True",
39
+ "num_filter": "8",
40
+ "num_group": "1",
41
+ "pad": "(1, 1)",
42
+ "stride": "(1, 1)"
43
+ },
44
+ "inputs": [[2, 0, 0], [3, 0, 0]]
45
+ },
46
+ {
47
+ "op": "null",
48
+ "name": "conv_1_batchnorm_gamma",
49
+ "attrs": {"fix_gamma": "True"},
50
+ "inputs": []
51
+ },
52
+ {
53
+ "op": "null",
54
+ "name": "conv_1_batchnorm_beta",
55
+ "attrs": {"fix_gamma": "True"},
56
+ "inputs": []
57
+ },
58
+ {
59
+ "op": "null",
60
+ "name": "conv_1_batchnorm_moving_mean",
61
+ "attrs": {
62
+ "__init__": "[\"zero\", {}]",
63
+ "fix_gamma": "True"
64
+ },
65
+ "inputs": []
66
+ },
67
+ {
68
+ "op": "null",
69
+ "name": "conv_1_batchnorm_moving_var",
70
+ "attrs": {
71
+ "__init__": "[\"one\", {}]",
72
+ "fix_gamma": "True"
73
+ },
74
+ "inputs": []
75
+ },
76
+ {
77
+ "op": "BatchNorm",
78
+ "name": "conv_1_batchnorm",
79
+ "attrs": {"fix_gamma": "True"},
80
+ "inputs": [[4, 0, 0], [5, 0, 0], [6, 0, 0], [7, 0, 1], [8, 0, 1]]
81
+ },
82
+ {
83
+ "op": "Activation",
84
+ "name": "conv_1_relu",
85
+ "attrs": {"act_type": "relu"},
86
+ "inputs": [[9, 0, 0]]
87
+ },
88
+ {
89
+ "op": "null",
90
+ "name": "conv_2_dw_conv2d_weight",
91
+ "attrs": {
92
+ "kernel": "(3, 3)",
93
+ "no_bias": "True",
94
+ "num_filter": "8",
95
+ "num_group": "8",
96
+ "pad": "(1, 1)",
97
+ "stride": "(1, 1)"
98
+ },
99
+ "inputs": []
100
+ },
101
+ {
102
+ "op": "Convolution",
103
+ "name": "conv_2_dw_conv2d",
104
+ "attrs": {
105
+ "kernel": "(3, 3)",
106
+ "no_bias": "True",
107
+ "num_filter": "8",
108
+ "num_group": "8",
109
+ "pad": "(1, 1)",
110
+ "stride": "(1, 1)"
111
+ },
112
+ "inputs": [[10, 0, 0], [11, 0, 0]]
113
+ },
114
+ {
115
+ "op": "null",
116
+ "name": "conv_2_dw_batchnorm_gamma",
117
+ "attrs": {"fix_gamma": "True"},
118
+ "inputs": []
119
+ },
120
+ {
121
+ "op": "null",
122
+ "name": "conv_2_dw_batchnorm_beta",
123
+ "attrs": {"fix_gamma": "True"},
124
+ "inputs": []
125
+ },
126
+ {
127
+ "op": "null",
128
+ "name": "conv_2_dw_batchnorm_moving_mean",
129
+ "attrs": {
130
+ "__init__": "[\"zero\", {}]",
131
+ "fix_gamma": "True"
132
+ },
133
+ "inputs": []
134
+ },
135
+ {
136
+ "op": "null",
137
+ "name": "conv_2_dw_batchnorm_moving_var",
138
+ "attrs": {
139
+ "__init__": "[\"one\", {}]",
140
+ "fix_gamma": "True"
141
+ },
142
+ "inputs": []
143
+ },
144
+ {
145
+ "op": "BatchNorm",
146
+ "name": "conv_2_dw_batchnorm",
147
+ "attrs": {"fix_gamma": "True"},
148
+ "inputs": [[12, 0, 0], [13, 0, 0], [14, 0, 0], [15, 0, 1], [16, 0, 1]]
149
+ },
150
+ {
151
+ "op": "Activation",
152
+ "name": "conv_2_dw_relu",
153
+ "attrs": {"act_type": "relu"},
154
+ "inputs": [[17, 0, 0]]
155
+ },
156
+ {
157
+ "op": "null",
158
+ "name": "conv_2_conv2d_weight",
159
+ "attrs": {
160
+ "kernel": "(1, 1)",
161
+ "no_bias": "True",
162
+ "num_filter": "16",
163
+ "num_group": "1",
164
+ "pad": "(0, 0)",
165
+ "stride": "(1, 1)"
166
+ },
167
+ "inputs": []
168
+ },
169
+ {
170
+ "op": "Convolution",
171
+ "name": "conv_2_conv2d",
172
+ "attrs": {
173
+ "kernel": "(1, 1)",
174
+ "no_bias": "True",
175
+ "num_filter": "16",
176
+ "num_group": "1",
177
+ "pad": "(0, 0)",
178
+ "stride": "(1, 1)"
179
+ },
180
+ "inputs": [[18, 0, 0], [19, 0, 0]]
181
+ },
182
+ {
183
+ "op": "null",
184
+ "name": "conv_2_batchnorm_gamma",
185
+ "attrs": {"fix_gamma": "True"},
186
+ "inputs": []
187
+ },
188
+ {
189
+ "op": "null",
190
+ "name": "conv_2_batchnorm_beta",
191
+ "attrs": {"fix_gamma": "True"},
192
+ "inputs": []
193
+ },
194
+ {
195
+ "op": "null",
196
+ "name": "conv_2_batchnorm_moving_mean",
197
+ "attrs": {
198
+ "__init__": "[\"zero\", {}]",
199
+ "fix_gamma": "True"
200
+ },
201
+ "inputs": []
202
+ },
203
+ {
204
+ "op": "null",
205
+ "name": "conv_2_batchnorm_moving_var",
206
+ "attrs": {
207
+ "__init__": "[\"one\", {}]",
208
+ "fix_gamma": "True"
209
+ },
210
+ "inputs": []
211
+ },
212
+ {
213
+ "op": "BatchNorm",
214
+ "name": "conv_2_batchnorm",
215
+ "attrs": {"fix_gamma": "True"},
216
+ "inputs": [[20, 0, 0], [21, 0, 0], [22, 0, 0], [23, 0, 1], [24, 0, 1]]
217
+ },
218
+ {
219
+ "op": "Activation",
220
+ "name": "conv_2_relu",
221
+ "attrs": {"act_type": "relu"},
222
+ "inputs": [[25, 0, 0]]
223
+ },
224
+ {
225
+ "op": "null",
226
+ "name": "conv_3_dw_conv2d_weight",
227
+ "attrs": {
228
+ "kernel": "(3, 3)",
229
+ "no_bias": "True",
230
+ "num_filter": "16",
231
+ "num_group": "16",
232
+ "pad": "(1, 1)",
233
+ "stride": "(2, 2)"
234
+ },
235
+ "inputs": []
236
+ },
237
+ {
238
+ "op": "Convolution",
239
+ "name": "conv_3_dw_conv2d",
240
+ "attrs": {
241
+ "kernel": "(3, 3)",
242
+ "no_bias": "True",
243
+ "num_filter": "16",
244
+ "num_group": "16",
245
+ "pad": "(1, 1)",
246
+ "stride": "(2, 2)"
247
+ },
248
+ "inputs": [[26, 0, 0], [27, 0, 0]]
249
+ },
250
+ {
251
+ "op": "null",
252
+ "name": "conv_3_dw_batchnorm_gamma",
253
+ "attrs": {"fix_gamma": "True"},
254
+ "inputs": []
255
+ },
256
+ {
257
+ "op": "null",
258
+ "name": "conv_3_dw_batchnorm_beta",
259
+ "attrs": {"fix_gamma": "True"},
260
+ "inputs": []
261
+ },
262
+ {
263
+ "op": "null",
264
+ "name": "conv_3_dw_batchnorm_moving_mean",
265
+ "attrs": {
266
+ "__init__": "[\"zero\", {}]",
267
+ "fix_gamma": "True"
268
+ },
269
+ "inputs": []
270
+ },
271
+ {
272
+ "op": "null",
273
+ "name": "conv_3_dw_batchnorm_moving_var",
274
+ "attrs": {
275
+ "__init__": "[\"one\", {}]",
276
+ "fix_gamma": "True"
277
+ },
278
+ "inputs": []
279
+ },
280
+ {
281
+ "op": "BatchNorm",
282
+ "name": "conv_3_dw_batchnorm",
283
+ "attrs": {"fix_gamma": "True"},
284
+ "inputs": [[28, 0, 0], [29, 0, 0], [30, 0, 0], [31, 0, 1], [32, 0, 1]]
285
+ },
286
+ {
287
+ "op": "Activation",
288
+ "name": "conv_3_dw_relu",
289
+ "attrs": {"act_type": "relu"},
290
+ "inputs": [[33, 0, 0]]
291
+ },
292
+ {
293
+ "op": "null",
294
+ "name": "conv_3_conv2d_weight",
295
+ "attrs": {
296
+ "kernel": "(1, 1)",
297
+ "no_bias": "True",
298
+ "num_filter": "32",
299
+ "num_group": "1",
300
+ "pad": "(0, 0)",
301
+ "stride": "(1, 1)"
302
+ },
303
+ "inputs": []
304
+ },
305
+ {
306
+ "op": "Convolution",
307
+ "name": "conv_3_conv2d",
308
+ "attrs": {
309
+ "kernel": "(1, 1)",
310
+ "no_bias": "True",
311
+ "num_filter": "32",
312
+ "num_group": "1",
313
+ "pad": "(0, 0)",
314
+ "stride": "(1, 1)"
315
+ },
316
+ "inputs": [[34, 0, 0], [35, 0, 0]]
317
+ },
318
+ {
319
+ "op": "null",
320
+ "name": "conv_3_batchnorm_gamma",
321
+ "attrs": {"fix_gamma": "True"},
322
+ "inputs": []
323
+ },
324
+ {
325
+ "op": "null",
326
+ "name": "conv_3_batchnorm_beta",
327
+ "attrs": {"fix_gamma": "True"},
328
+ "inputs": []
329
+ },
330
+ {
331
+ "op": "null",
332
+ "name": "conv_3_batchnorm_moving_mean",
333
+ "attrs": {
334
+ "__init__": "[\"zero\", {}]",
335
+ "fix_gamma": "True"
336
+ },
337
+ "inputs": []
338
+ },
339
+ {
340
+ "op": "null",
341
+ "name": "conv_3_batchnorm_moving_var",
342
+ "attrs": {
343
+ "__init__": "[\"one\", {}]",
344
+ "fix_gamma": "True"
345
+ },
346
+ "inputs": []
347
+ },
348
+ {
349
+ "op": "BatchNorm",
350
+ "name": "conv_3_batchnorm",
351
+ "attrs": {"fix_gamma": "True"},
352
+ "inputs": [[36, 0, 0], [37, 0, 0], [38, 0, 0], [39, 0, 1], [40, 0, 1]]
353
+ },
354
+ {
355
+ "op": "Activation",
356
+ "name": "conv_3_relu",
357
+ "attrs": {"act_type": "relu"},
358
+ "inputs": [[41, 0, 0]]
359
+ },
360
+ {
361
+ "op": "null",
362
+ "name": "conv_4_dw_conv2d_weight",
363
+ "attrs": {
364
+ "kernel": "(3, 3)",
365
+ "no_bias": "True",
366
+ "num_filter": "32",
367
+ "num_group": "32",
368
+ "pad": "(1, 1)",
369
+ "stride": "(1, 1)"
370
+ },
371
+ "inputs": []
372
+ },
373
+ {
374
+ "op": "Convolution",
375
+ "name": "conv_4_dw_conv2d",
376
+ "attrs": {
377
+ "kernel": "(3, 3)",
378
+ "no_bias": "True",
379
+ "num_filter": "32",
380
+ "num_group": "32",
381
+ "pad": "(1, 1)",
382
+ "stride": "(1, 1)"
383
+ },
384
+ "inputs": [[42, 0, 0], [43, 0, 0]]
385
+ },
386
+ {
387
+ "op": "null",
388
+ "name": "conv_4_dw_batchnorm_gamma",
389
+ "attrs": {"fix_gamma": "True"},
390
+ "inputs": []
391
+ },
392
+ {
393
+ "op": "null",
394
+ "name": "conv_4_dw_batchnorm_beta",
395
+ "attrs": {"fix_gamma": "True"},
396
+ "inputs": []
397
+ },
398
+ {
399
+ "op": "null",
400
+ "name": "conv_4_dw_batchnorm_moving_mean",
401
+ "attrs": {
402
+ "__init__": "[\"zero\", {}]",
403
+ "fix_gamma": "True"
404
+ },
405
+ "inputs": []
406
+ },
407
+ {
408
+ "op": "null",
409
+ "name": "conv_4_dw_batchnorm_moving_var",
410
+ "attrs": {
411
+ "__init__": "[\"one\", {}]",
412
+ "fix_gamma": "True"
413
+ },
414
+ "inputs": []
415
+ },
416
+ {
417
+ "op": "BatchNorm",
418
+ "name": "conv_4_dw_batchnorm",
419
+ "attrs": {"fix_gamma": "True"},
420
+ "inputs": [[44, 0, 0], [45, 0, 0], [46, 0, 0], [47, 0, 1], [48, 0, 1]]
421
+ },
422
+ {
423
+ "op": "Activation",
424
+ "name": "conv_4_dw_relu",
425
+ "attrs": {"act_type": "relu"},
426
+ "inputs": [[49, 0, 0]]
427
+ },
428
+ {
429
+ "op": "null",
430
+ "name": "conv_4_conv2d_weight",
431
+ "attrs": {
432
+ "kernel": "(1, 1)",
433
+ "no_bias": "True",
434
+ "num_filter": "32",
435
+ "num_group": "1",
436
+ "pad": "(0, 0)",
437
+ "stride": "(1, 1)"
438
+ },
439
+ "inputs": []
440
+ },
441
+ {
442
+ "op": "Convolution",
443
+ "name": "conv_4_conv2d",
444
+ "attrs": {
445
+ "kernel": "(1, 1)",
446
+ "no_bias": "True",
447
+ "num_filter": "32",
448
+ "num_group": "1",
449
+ "pad": "(0, 0)",
450
+ "stride": "(1, 1)"
451
+ },
452
+ "inputs": [[50, 0, 0], [51, 0, 0]]
453
+ },
454
+ {
455
+ "op": "null",
456
+ "name": "conv_4_batchnorm_gamma",
457
+ "attrs": {"fix_gamma": "True"},
458
+ "inputs": []
459
+ },
460
+ {
461
+ "op": "null",
462
+ "name": "conv_4_batchnorm_beta",
463
+ "attrs": {"fix_gamma": "True"},
464
+ "inputs": []
465
+ },
466
+ {
467
+ "op": "null",
468
+ "name": "conv_4_batchnorm_moving_mean",
469
+ "attrs": {
470
+ "__init__": "[\"zero\", {}]",
471
+ "fix_gamma": "True"
472
+ },
473
+ "inputs": []
474
+ },
475
+ {
476
+ "op": "null",
477
+ "name": "conv_4_batchnorm_moving_var",
478
+ "attrs": {
479
+ "__init__": "[\"one\", {}]",
480
+ "fix_gamma": "True"
481
+ },
482
+ "inputs": []
483
+ },
484
+ {
485
+ "op": "BatchNorm",
486
+ "name": "conv_4_batchnorm",
487
+ "attrs": {"fix_gamma": "True"},
488
+ "inputs": [[52, 0, 0], [53, 0, 0], [54, 0, 0], [55, 0, 1], [56, 0, 1]]
489
+ },
490
+ {
491
+ "op": "Activation",
492
+ "name": "conv_4_relu",
493
+ "attrs": {"act_type": "relu"},
494
+ "inputs": [[57, 0, 0]]
495
+ },
496
+ {
497
+ "op": "null",
498
+ "name": "conv_5_dw_conv2d_weight",
499
+ "attrs": {
500
+ "kernel": "(3, 3)",
501
+ "no_bias": "True",
502
+ "num_filter": "32",
503
+ "num_group": "32",
504
+ "pad": "(1, 1)",
505
+ "stride": "(2, 2)"
506
+ },
507
+ "inputs": []
508
+ },
509
+ {
510
+ "op": "Convolution",
511
+ "name": "conv_5_dw_conv2d",
512
+ "attrs": {
513
+ "kernel": "(3, 3)",
514
+ "no_bias": "True",
515
+ "num_filter": "32",
516
+ "num_group": "32",
517
+ "pad": "(1, 1)",
518
+ "stride": "(2, 2)"
519
+ },
520
+ "inputs": [[58, 0, 0], [59, 0, 0]]
521
+ },
522
+ {
523
+ "op": "null",
524
+ "name": "conv_5_dw_batchnorm_gamma",
525
+ "attrs": {"fix_gamma": "True"},
526
+ "inputs": []
527
+ },
528
+ {
529
+ "op": "null",
530
+ "name": "conv_5_dw_batchnorm_beta",
531
+ "attrs": {"fix_gamma": "True"},
532
+ "inputs": []
533
+ },
534
+ {
535
+ "op": "null",
536
+ "name": "conv_5_dw_batchnorm_moving_mean",
537
+ "attrs": {
538
+ "__init__": "[\"zero\", {}]",
539
+ "fix_gamma": "True"
540
+ },
541
+ "inputs": []
542
+ },
543
+ {
544
+ "op": "null",
545
+ "name": "conv_5_dw_batchnorm_moving_var",
546
+ "attrs": {
547
+ "__init__": "[\"one\", {}]",
548
+ "fix_gamma": "True"
549
+ },
550
+ "inputs": []
551
+ },
552
+ {
553
+ "op": "BatchNorm",
554
+ "name": "conv_5_dw_batchnorm",
555
+ "attrs": {"fix_gamma": "True"},
556
+ "inputs": [[60, 0, 0], [61, 0, 0], [62, 0, 0], [63, 0, 1], [64, 0, 1]]
557
+ },
558
+ {
559
+ "op": "Activation",
560
+ "name": "conv_5_dw_relu",
561
+ "attrs": {"act_type": "relu"},
562
+ "inputs": [[65, 0, 0]]
563
+ },
564
+ {
565
+ "op": "null",
566
+ "name": "conv_5_conv2d_weight",
567
+ "attrs": {
568
+ "kernel": "(1, 1)",
569
+ "no_bias": "True",
570
+ "num_filter": "64",
571
+ "num_group": "1",
572
+ "pad": "(0, 0)",
573
+ "stride": "(1, 1)"
574
+ },
575
+ "inputs": []
576
+ },
577
+ {
578
+ "op": "Convolution",
579
+ "name": "conv_5_conv2d",
580
+ "attrs": {
581
+ "kernel": "(1, 1)",
582
+ "no_bias": "True",
583
+ "num_filter": "64",
584
+ "num_group": "1",
585
+ "pad": "(0, 0)",
586
+ "stride": "(1, 1)"
587
+ },
588
+ "inputs": [[66, 0, 0], [67, 0, 0]]
589
+ },
590
+ {
591
+ "op": "null",
592
+ "name": "conv_5_batchnorm_gamma",
593
+ "attrs": {"fix_gamma": "True"},
594
+ "inputs": []
595
+ },
596
+ {
597
+ "op": "null",
598
+ "name": "conv_5_batchnorm_beta",
599
+ "attrs": {"fix_gamma": "True"},
600
+ "inputs": []
601
+ },
602
+ {
603
+ "op": "null",
604
+ "name": "conv_5_batchnorm_moving_mean",
605
+ "attrs": {
606
+ "__init__": "[\"zero\", {}]",
607
+ "fix_gamma": "True"
608
+ },
609
+ "inputs": []
610
+ },
611
+ {
612
+ "op": "null",
613
+ "name": "conv_5_batchnorm_moving_var",
614
+ "attrs": {
615
+ "__init__": "[\"one\", {}]",
616
+ "fix_gamma": "True"
617
+ },
618
+ "inputs": []
619
+ },
620
+ {
621
+ "op": "BatchNorm",
622
+ "name": "conv_5_batchnorm",
623
+ "attrs": {"fix_gamma": "True"},
624
+ "inputs": [[68, 0, 0], [69, 0, 0], [70, 0, 0], [71, 0, 1], [72, 0, 1]]
625
+ },
626
+ {
627
+ "op": "Activation",
628
+ "name": "conv_5_relu",
629
+ "attrs": {"act_type": "relu"},
630
+ "inputs": [[73, 0, 0]]
631
+ },
632
+ {
633
+ "op": "null",
634
+ "name": "conv_6_dw_conv2d_weight",
635
+ "attrs": {
636
+ "kernel": "(3, 3)",
637
+ "no_bias": "True",
638
+ "num_filter": "64",
639
+ "num_group": "64",
640
+ "pad": "(1, 1)",
641
+ "stride": "(1, 1)"
642
+ },
643
+ "inputs": []
644
+ },
645
+ {
646
+ "op": "Convolution",
647
+ "name": "conv_6_dw_conv2d",
648
+ "attrs": {
649
+ "kernel": "(3, 3)",
650
+ "no_bias": "True",
651
+ "num_filter": "64",
652
+ "num_group": "64",
653
+ "pad": "(1, 1)",
654
+ "stride": "(1, 1)"
655
+ },
656
+ "inputs": [[74, 0, 0], [75, 0, 0]]
657
+ },
658
+ {
659
+ "op": "null",
660
+ "name": "conv_6_dw_batchnorm_gamma",
661
+ "attrs": {"fix_gamma": "True"},
662
+ "inputs": []
663
+ },
664
+ {
665
+ "op": "null",
666
+ "name": "conv_6_dw_batchnorm_beta",
667
+ "attrs": {"fix_gamma": "True"},
668
+ "inputs": []
669
+ },
670
+ {
671
+ "op": "null",
672
+ "name": "conv_6_dw_batchnorm_moving_mean",
673
+ "attrs": {
674
+ "__init__": "[\"zero\", {}]",
675
+ "fix_gamma": "True"
676
+ },
677
+ "inputs": []
678
+ },
679
+ {
680
+ "op": "null",
681
+ "name": "conv_6_dw_batchnorm_moving_var",
682
+ "attrs": {
683
+ "__init__": "[\"one\", {}]",
684
+ "fix_gamma": "True"
685
+ },
686
+ "inputs": []
687
+ },
688
+ {
689
+ "op": "BatchNorm",
690
+ "name": "conv_6_dw_batchnorm",
691
+ "attrs": {"fix_gamma": "True"},
692
+ "inputs": [[76, 0, 0], [77, 0, 0], [78, 0, 0], [79, 0, 1], [80, 0, 1]]
693
+ },
694
+ {
695
+ "op": "Activation",
696
+ "name": "conv_6_dw_relu",
697
+ "attrs": {"act_type": "relu"},
698
+ "inputs": [[81, 0, 0]]
699
+ },
700
+ {
701
+ "op": "null",
702
+ "name": "conv_6_conv2d_weight",
703
+ "attrs": {
704
+ "kernel": "(1, 1)",
705
+ "no_bias": "True",
706
+ "num_filter": "64",
707
+ "num_group": "1",
708
+ "pad": "(0, 0)",
709
+ "stride": "(1, 1)"
710
+ },
711
+ "inputs": []
712
+ },
713
+ {
714
+ "op": "Convolution",
715
+ "name": "conv_6_conv2d",
716
+ "attrs": {
717
+ "kernel": "(1, 1)",
718
+ "no_bias": "True",
719
+ "num_filter": "64",
720
+ "num_group": "1",
721
+ "pad": "(0, 0)",
722
+ "stride": "(1, 1)"
723
+ },
724
+ "inputs": [[82, 0, 0], [83, 0, 0]]
725
+ },
726
+ {
727
+ "op": "null",
728
+ "name": "conv_6_batchnorm_gamma",
729
+ "attrs": {"fix_gamma": "True"},
730
+ "inputs": []
731
+ },
732
+ {
733
+ "op": "null",
734
+ "name": "conv_6_batchnorm_beta",
735
+ "attrs": {"fix_gamma": "True"},
736
+ "inputs": []
737
+ },
738
+ {
739
+ "op": "null",
740
+ "name": "conv_6_batchnorm_moving_mean",
741
+ "attrs": {
742
+ "__init__": "[\"zero\", {}]",
743
+ "fix_gamma": "True"
744
+ },
745
+ "inputs": []
746
+ },
747
+ {
748
+ "op": "null",
749
+ "name": "conv_6_batchnorm_moving_var",
750
+ "attrs": {
751
+ "__init__": "[\"one\", {}]",
752
+ "fix_gamma": "True"
753
+ },
754
+ "inputs": []
755
+ },
756
+ {
757
+ "op": "BatchNorm",
758
+ "name": "conv_6_batchnorm",
759
+ "attrs": {"fix_gamma": "True"},
760
+ "inputs": [[84, 0, 0], [85, 0, 0], [86, 0, 0], [87, 0, 1], [88, 0, 1]]
761
+ },
762
+ {
763
+ "op": "Activation",
764
+ "name": "conv_6_relu",
765
+ "attrs": {"act_type": "relu"},
766
+ "inputs": [[89, 0, 0]]
767
+ },
768
+ {
769
+ "op": "null",
770
+ "name": "conv_7_dw_conv2d_weight",
771
+ "attrs": {
772
+ "kernel": "(3, 3)",
773
+ "no_bias": "True",
774
+ "num_filter": "64",
775
+ "num_group": "64",
776
+ "pad": "(1, 1)",
777
+ "stride": "(2, 2)"
778
+ },
779
+ "inputs": []
780
+ },
781
+ {
782
+ "op": "Convolution",
783
+ "name": "conv_7_dw_conv2d",
784
+ "attrs": {
785
+ "kernel": "(3, 3)",
786
+ "no_bias": "True",
787
+ "num_filter": "64",
788
+ "num_group": "64",
789
+ "pad": "(1, 1)",
790
+ "stride": "(2, 2)"
791
+ },
792
+ "inputs": [[90, 0, 0], [91, 0, 0]]
793
+ },
794
+ {
795
+ "op": "null",
796
+ "name": "conv_7_dw_batchnorm_gamma",
797
+ "attrs": {"fix_gamma": "True"},
798
+ "inputs": []
799
+ },
800
+ {
801
+ "op": "null",
802
+ "name": "conv_7_dw_batchnorm_beta",
803
+ "attrs": {"fix_gamma": "True"},
804
+ "inputs": []
805
+ },
806
+ {
807
+ "op": "null",
808
+ "name": "conv_7_dw_batchnorm_moving_mean",
809
+ "attrs": {
810
+ "__init__": "[\"zero\", {}]",
811
+ "fix_gamma": "True"
812
+ },
813
+ "inputs": []
814
+ },
815
+ {
816
+ "op": "null",
817
+ "name": "conv_7_dw_batchnorm_moving_var",
818
+ "attrs": {
819
+ "__init__": "[\"one\", {}]",
820
+ "fix_gamma": "True"
821
+ },
822
+ "inputs": []
823
+ },
824
+ {
825
+ "op": "BatchNorm",
826
+ "name": "conv_7_dw_batchnorm",
827
+ "attrs": {"fix_gamma": "True"},
828
+ "inputs": [[92, 0, 0], [93, 0, 0], [94, 0, 0], [95, 0, 1], [96, 0, 1]]
829
+ },
830
+ {
831
+ "op": "Activation",
832
+ "name": "conv_7_dw_relu",
833
+ "attrs": {"act_type": "relu"},
834
+ "inputs": [[97, 0, 0]]
835
+ },
836
+ {
837
+ "op": "null",
838
+ "name": "conv_7_conv2d_weight",
839
+ "attrs": {
840
+ "kernel": "(1, 1)",
841
+ "no_bias": "True",
842
+ "num_filter": "128",
843
+ "num_group": "1",
844
+ "pad": "(0, 0)",
845
+ "stride": "(1, 1)"
846
+ },
847
+ "inputs": []
848
+ },
849
+ {
850
+ "op": "Convolution",
851
+ "name": "conv_7_conv2d",
852
+ "attrs": {
853
+ "kernel": "(1, 1)",
854
+ "no_bias": "True",
855
+ "num_filter": "128",
856
+ "num_group": "1",
857
+ "pad": "(0, 0)",
858
+ "stride": "(1, 1)"
859
+ },
860
+ "inputs": [[98, 0, 0], [99, 0, 0]]
861
+ },
862
+ {
863
+ "op": "null",
864
+ "name": "conv_7_batchnorm_gamma",
865
+ "attrs": {"fix_gamma": "True"},
866
+ "inputs": []
867
+ },
868
+ {
869
+ "op": "null",
870
+ "name": "conv_7_batchnorm_beta",
871
+ "attrs": {"fix_gamma": "True"},
872
+ "inputs": []
873
+ },
874
+ {
875
+ "op": "null",
876
+ "name": "conv_7_batchnorm_moving_mean",
877
+ "attrs": {
878
+ "__init__": "[\"zero\", {}]",
879
+ "fix_gamma": "True"
880
+ },
881
+ "inputs": []
882
+ },
883
+ {
884
+ "op": "null",
885
+ "name": "conv_7_batchnorm_moving_var",
886
+ "attrs": {
887
+ "__init__": "[\"one\", {}]",
888
+ "fix_gamma": "True"
889
+ },
890
+ "inputs": []
891
+ },
892
+ {
893
+ "op": "BatchNorm",
894
+ "name": "conv_7_batchnorm",
895
+ "attrs": {"fix_gamma": "True"},
896
+ "inputs": [[100, 0, 0], [101, 0, 0], [102, 0, 0], [103, 0, 1], [104, 0, 1]]
897
+ },
898
+ {
899
+ "op": "Activation",
900
+ "name": "conv_7_relu",
901
+ "attrs": {"act_type": "relu"},
902
+ "inputs": [[105, 0, 0]]
903
+ },
904
+ {
905
+ "op": "null",
906
+ "name": "conv_8_dw_conv2d_weight",
907
+ "attrs": {
908
+ "kernel": "(3, 3)",
909
+ "no_bias": "True",
910
+ "num_filter": "128",
911
+ "num_group": "128",
912
+ "pad": "(1, 1)",
913
+ "stride": "(1, 1)"
914
+ },
915
+ "inputs": []
916
+ },
917
+ {
918
+ "op": "Convolution",
919
+ "name": "conv_8_dw_conv2d",
920
+ "attrs": {
921
+ "kernel": "(3, 3)",
922
+ "no_bias": "True",
923
+ "num_filter": "128",
924
+ "num_group": "128",
925
+ "pad": "(1, 1)",
926
+ "stride": "(1, 1)"
927
+ },
928
+ "inputs": [[106, 0, 0], [107, 0, 0]]
929
+ },
930
+ {
931
+ "op": "null",
932
+ "name": "conv_8_dw_batchnorm_gamma",
933
+ "attrs": {"fix_gamma": "True"},
934
+ "inputs": []
935
+ },
936
+ {
937
+ "op": "null",
938
+ "name": "conv_8_dw_batchnorm_beta",
939
+ "attrs": {"fix_gamma": "True"},
940
+ "inputs": []
941
+ },
942
+ {
943
+ "op": "null",
944
+ "name": "conv_8_dw_batchnorm_moving_mean",
945
+ "attrs": {
946
+ "__init__": "[\"zero\", {}]",
947
+ "fix_gamma": "True"
948
+ },
949
+ "inputs": []
950
+ },
951
+ {
952
+ "op": "null",
953
+ "name": "conv_8_dw_batchnorm_moving_var",
954
+ "attrs": {
955
+ "__init__": "[\"one\", {}]",
956
+ "fix_gamma": "True"
957
+ },
958
+ "inputs": []
959
+ },
960
+ {
961
+ "op": "BatchNorm",
962
+ "name": "conv_8_dw_batchnorm",
963
+ "attrs": {"fix_gamma": "True"},
964
+ "inputs": [[108, 0, 0], [109, 0, 0], [110, 0, 0], [111, 0, 1], [112, 0, 1]]
965
+ },
966
+ {
967
+ "op": "Activation",
968
+ "name": "conv_8_dw_relu",
969
+ "attrs": {"act_type": "relu"},
970
+ "inputs": [[113, 0, 0]]
971
+ },
972
+ {
973
+ "op": "null",
974
+ "name": "conv_8_conv2d_weight",
975
+ "attrs": {
976
+ "kernel": "(1, 1)",
977
+ "no_bias": "True",
978
+ "num_filter": "128",
979
+ "num_group": "1",
980
+ "pad": "(0, 0)",
981
+ "stride": "(1, 1)"
982
+ },
983
+ "inputs": []
984
+ },
985
+ {
986
+ "op": "Convolution",
987
+ "name": "conv_8_conv2d",
988
+ "attrs": {
989
+ "kernel": "(1, 1)",
990
+ "no_bias": "True",
991
+ "num_filter": "128",
992
+ "num_group": "1",
993
+ "pad": "(0, 0)",
994
+ "stride": "(1, 1)"
995
+ },
996
+ "inputs": [[114, 0, 0], [115, 0, 0]]
997
+ },
998
+ {
999
+ "op": "null",
1000
+ "name": "conv_8_batchnorm_gamma",
1001
+ "attrs": {"fix_gamma": "True"},
1002
+ "inputs": []
1003
+ },
1004
+ {
1005
+ "op": "null",
1006
+ "name": "conv_8_batchnorm_beta",
1007
+ "attrs": {"fix_gamma": "True"},
1008
+ "inputs": []
1009
+ },
1010
+ {
1011
+ "op": "null",
1012
+ "name": "conv_8_batchnorm_moving_mean",
1013
+ "attrs": {
1014
+ "__init__": "[\"zero\", {}]",
1015
+ "fix_gamma": "True"
1016
+ },
1017
+ "inputs": []
1018
+ },
1019
+ {
1020
+ "op": "null",
1021
+ "name": "conv_8_batchnorm_moving_var",
1022
+ "attrs": {
1023
+ "__init__": "[\"one\", {}]",
1024
+ "fix_gamma": "True"
1025
+ },
1026
+ "inputs": []
1027
+ },
1028
+ {
1029
+ "op": "BatchNorm",
1030
+ "name": "conv_8_batchnorm",
1031
+ "attrs": {"fix_gamma": "True"},
1032
+ "inputs": [[116, 0, 0], [117, 0, 0], [118, 0, 0], [119, 0, 1], [120, 0, 1]]
1033
+ },
1034
+ {
1035
+ "op": "Activation",
1036
+ "name": "conv_8_relu",
1037
+ "attrs": {"act_type": "relu"},
1038
+ "inputs": [[121, 0, 0]]
1039
+ },
1040
+ {
1041
+ "op": "null",
1042
+ "name": "conv_9_dw_conv2d_weight",
1043
+ "attrs": {
1044
+ "kernel": "(3, 3)",
1045
+ "no_bias": "True",
1046
+ "num_filter": "128",
1047
+ "num_group": "128",
1048
+ "pad": "(1, 1)",
1049
+ "stride": "(1, 1)"
1050
+ },
1051
+ "inputs": []
1052
+ },
1053
+ {
1054
+ "op": "Convolution",
1055
+ "name": "conv_9_dw_conv2d",
1056
+ "attrs": {
1057
+ "kernel": "(3, 3)",
1058
+ "no_bias": "True",
1059
+ "num_filter": "128",
1060
+ "num_group": "128",
1061
+ "pad": "(1, 1)",
1062
+ "stride": "(1, 1)"
1063
+ },
1064
+ "inputs": [[122, 0, 0], [123, 0, 0]]
1065
+ },
1066
+ {
1067
+ "op": "null",
1068
+ "name": "conv_9_dw_batchnorm_gamma",
1069
+ "attrs": {"fix_gamma": "True"},
1070
+ "inputs": []
1071
+ },
1072
+ {
1073
+ "op": "null",
1074
+ "name": "conv_9_dw_batchnorm_beta",
1075
+ "attrs": {"fix_gamma": "True"},
1076
+ "inputs": []
1077
+ },
1078
+ {
1079
+ "op": "null",
1080
+ "name": "conv_9_dw_batchnorm_moving_mean",
1081
+ "attrs": {
1082
+ "__init__": "[\"zero\", {}]",
1083
+ "fix_gamma": "True"
1084
+ },
1085
+ "inputs": []
1086
+ },
1087
+ {
1088
+ "op": "null",
1089
+ "name": "conv_9_dw_batchnorm_moving_var",
1090
+ "attrs": {
1091
+ "__init__": "[\"one\", {}]",
1092
+ "fix_gamma": "True"
1093
+ },
1094
+ "inputs": []
1095
+ },
1096
+ {
1097
+ "op": "BatchNorm",
1098
+ "name": "conv_9_dw_batchnorm",
1099
+ "attrs": {"fix_gamma": "True"},
1100
+ "inputs": [[124, 0, 0], [125, 0, 0], [126, 0, 0], [127, 0, 1], [128, 0, 1]]
1101
+ },
1102
+ {
1103
+ "op": "Activation",
1104
+ "name": "conv_9_dw_relu",
1105
+ "attrs": {"act_type": "relu"},
1106
+ "inputs": [[129, 0, 0]]
1107
+ },
1108
+ {
1109
+ "op": "null",
1110
+ "name": "conv_9_conv2d_weight",
1111
+ "attrs": {
1112
+ "kernel": "(1, 1)",
1113
+ "no_bias": "True",
1114
+ "num_filter": "128",
1115
+ "num_group": "1",
1116
+ "pad": "(0, 0)",
1117
+ "stride": "(1, 1)"
1118
+ },
1119
+ "inputs": []
1120
+ },
1121
+ {
1122
+ "op": "Convolution",
1123
+ "name": "conv_9_conv2d",
1124
+ "attrs": {
1125
+ "kernel": "(1, 1)",
1126
+ "no_bias": "True",
1127
+ "num_filter": "128",
1128
+ "num_group": "1",
1129
+ "pad": "(0, 0)",
1130
+ "stride": "(1, 1)"
1131
+ },
1132
+ "inputs": [[130, 0, 0], [131, 0, 0]]
1133
+ },
1134
+ {
1135
+ "op": "null",
1136
+ "name": "conv_9_batchnorm_gamma",
1137
+ "attrs": {"fix_gamma": "True"},
1138
+ "inputs": []
1139
+ },
1140
+ {
1141
+ "op": "null",
1142
+ "name": "conv_9_batchnorm_beta",
1143
+ "attrs": {"fix_gamma": "True"},
1144
+ "inputs": []
1145
+ },
1146
+ {
1147
+ "op": "null",
1148
+ "name": "conv_9_batchnorm_moving_mean",
1149
+ "attrs": {
1150
+ "__init__": "[\"zero\", {}]",
1151
+ "fix_gamma": "True"
1152
+ },
1153
+ "inputs": []
1154
+ },
1155
+ {
1156
+ "op": "null",
1157
+ "name": "conv_9_batchnorm_moving_var",
1158
+ "attrs": {
1159
+ "__init__": "[\"one\", {}]",
1160
+ "fix_gamma": "True"
1161
+ },
1162
+ "inputs": []
1163
+ },
1164
+ {
1165
+ "op": "BatchNorm",
1166
+ "name": "conv_9_batchnorm",
1167
+ "attrs": {"fix_gamma": "True"},
1168
+ "inputs": [[132, 0, 0], [133, 0, 0], [134, 0, 0], [135, 0, 1], [136, 0, 1]]
1169
+ },
1170
+ {
1171
+ "op": "Activation",
1172
+ "name": "conv_9_relu",
1173
+ "attrs": {"act_type": "relu"},
1174
+ "inputs": [[137, 0, 0]]
1175
+ },
1176
+ {
1177
+ "op": "null",
1178
+ "name": "conv_10_dw_conv2d_weight",
1179
+ "attrs": {
1180
+ "kernel": "(3, 3)",
1181
+ "no_bias": "True",
1182
+ "num_filter": "128",
1183
+ "num_group": "128",
1184
+ "pad": "(1, 1)",
1185
+ "stride": "(1, 1)"
1186
+ },
1187
+ "inputs": []
1188
+ },
1189
+ {
1190
+ "op": "Convolution",
1191
+ "name": "conv_10_dw_conv2d",
1192
+ "attrs": {
1193
+ "kernel": "(3, 3)",
1194
+ "no_bias": "True",
1195
+ "num_filter": "128",
1196
+ "num_group": "128",
1197
+ "pad": "(1, 1)",
1198
+ "stride": "(1, 1)"
1199
+ },
1200
+ "inputs": [[138, 0, 0], [139, 0, 0]]
1201
+ },
1202
+ {
1203
+ "op": "null",
1204
+ "name": "conv_10_dw_batchnorm_gamma",
1205
+ "attrs": {"fix_gamma": "True"},
1206
+ "inputs": []
1207
+ },
1208
+ {
1209
+ "op": "null",
1210
+ "name": "conv_10_dw_batchnorm_beta",
1211
+ "attrs": {"fix_gamma": "True"},
1212
+ "inputs": []
1213
+ },
1214
+ {
1215
+ "op": "null",
1216
+ "name": "conv_10_dw_batchnorm_moving_mean",
1217
+ "attrs": {
1218
+ "__init__": "[\"zero\", {}]",
1219
+ "fix_gamma": "True"
1220
+ },
1221
+ "inputs": []
1222
+ },
1223
+ {
1224
+ "op": "null",
1225
+ "name": "conv_10_dw_batchnorm_moving_var",
1226
+ "attrs": {
1227
+ "__init__": "[\"one\", {}]",
1228
+ "fix_gamma": "True"
1229
+ },
1230
+ "inputs": []
1231
+ },
1232
+ {
1233
+ "op": "BatchNorm",
1234
+ "name": "conv_10_dw_batchnorm",
1235
+ "attrs": {"fix_gamma": "True"},
1236
+ "inputs": [[140, 0, 0], [141, 0, 0], [142, 0, 0], [143, 0, 1], [144, 0, 1]]
1237
+ },
1238
+ {
1239
+ "op": "Activation",
1240
+ "name": "conv_10_dw_relu",
1241
+ "attrs": {"act_type": "relu"},
1242
+ "inputs": [[145, 0, 0]]
1243
+ },
1244
+ {
1245
+ "op": "null",
1246
+ "name": "conv_10_conv2d_weight",
1247
+ "attrs": {
1248
+ "kernel": "(1, 1)",
1249
+ "no_bias": "True",
1250
+ "num_filter": "128",
1251
+ "num_group": "1",
1252
+ "pad": "(0, 0)",
1253
+ "stride": "(1, 1)"
1254
+ },
1255
+ "inputs": []
1256
+ },
1257
+ {
1258
+ "op": "Convolution",
1259
+ "name": "conv_10_conv2d",
1260
+ "attrs": {
1261
+ "kernel": "(1, 1)",
1262
+ "no_bias": "True",
1263
+ "num_filter": "128",
1264
+ "num_group": "1",
1265
+ "pad": "(0, 0)",
1266
+ "stride": "(1, 1)"
1267
+ },
1268
+ "inputs": [[146, 0, 0], [147, 0, 0]]
1269
+ },
1270
+ {
1271
+ "op": "null",
1272
+ "name": "conv_10_batchnorm_gamma",
1273
+ "attrs": {"fix_gamma": "True"},
1274
+ "inputs": []
1275
+ },
1276
+ {
1277
+ "op": "null",
1278
+ "name": "conv_10_batchnorm_beta",
1279
+ "attrs": {"fix_gamma": "True"},
1280
+ "inputs": []
1281
+ },
1282
+ {
1283
+ "op": "null",
1284
+ "name": "conv_10_batchnorm_moving_mean",
1285
+ "attrs": {
1286
+ "__init__": "[\"zero\", {}]",
1287
+ "fix_gamma": "True"
1288
+ },
1289
+ "inputs": []
1290
+ },
1291
+ {
1292
+ "op": "null",
1293
+ "name": "conv_10_batchnorm_moving_var",
1294
+ "attrs": {
1295
+ "__init__": "[\"one\", {}]",
1296
+ "fix_gamma": "True"
1297
+ },
1298
+ "inputs": []
1299
+ },
1300
+ {
1301
+ "op": "BatchNorm",
1302
+ "name": "conv_10_batchnorm",
1303
+ "attrs": {"fix_gamma": "True"},
1304
+ "inputs": [[148, 0, 0], [149, 0, 0], [150, 0, 0], [151, 0, 1], [152, 0, 1]]
1305
+ },
1306
+ {
1307
+ "op": "Activation",
1308
+ "name": "conv_10_relu",
1309
+ "attrs": {"act_type": "relu"},
1310
+ "inputs": [[153, 0, 0]]
1311
+ },
1312
+ {
1313
+ "op": "null",
1314
+ "name": "conv_11_dw_conv2d_weight",
1315
+ "attrs": {
1316
+ "kernel": "(3, 3)",
1317
+ "no_bias": "True",
1318
+ "num_filter": "128",
1319
+ "num_group": "128",
1320
+ "pad": "(1, 1)",
1321
+ "stride": "(1, 1)"
1322
+ },
1323
+ "inputs": []
1324
+ },
1325
+ {
1326
+ "op": "Convolution",
1327
+ "name": "conv_11_dw_conv2d",
1328
+ "attrs": {
1329
+ "kernel": "(3, 3)",
1330
+ "no_bias": "True",
1331
+ "num_filter": "128",
1332
+ "num_group": "128",
1333
+ "pad": "(1, 1)",
1334
+ "stride": "(1, 1)"
1335
+ },
1336
+ "inputs": [[154, 0, 0], [155, 0, 0]]
1337
+ },
1338
+ {
1339
+ "op": "null",
1340
+ "name": "conv_11_dw_batchnorm_gamma",
1341
+ "attrs": {"fix_gamma": "True"},
1342
+ "inputs": []
1343
+ },
1344
+ {
1345
+ "op": "null",
1346
+ "name": "conv_11_dw_batchnorm_beta",
1347
+ "attrs": {"fix_gamma": "True"},
1348
+ "inputs": []
1349
+ },
1350
+ {
1351
+ "op": "null",
1352
+ "name": "conv_11_dw_batchnorm_moving_mean",
1353
+ "attrs": {
1354
+ "__init__": "[\"zero\", {}]",
1355
+ "fix_gamma": "True"
1356
+ },
1357
+ "inputs": []
1358
+ },
1359
+ {
1360
+ "op": "null",
1361
+ "name": "conv_11_dw_batchnorm_moving_var",
1362
+ "attrs": {
1363
+ "__init__": "[\"one\", {}]",
1364
+ "fix_gamma": "True"
1365
+ },
1366
+ "inputs": []
1367
+ },
1368
+ {
1369
+ "op": "BatchNorm",
1370
+ "name": "conv_11_dw_batchnorm",
1371
+ "attrs": {"fix_gamma": "True"},
1372
+ "inputs": [[156, 0, 0], [157, 0, 0], [158, 0, 0], [159, 0, 1], [160, 0, 1]]
1373
+ },
1374
+ {
1375
+ "op": "Activation",
1376
+ "name": "conv_11_dw_relu",
1377
+ "attrs": {"act_type": "relu"},
1378
+ "inputs": [[161, 0, 0]]
1379
+ },
1380
+ {
1381
+ "op": "null",
1382
+ "name": "conv_11_conv2d_weight",
1383
+ "attrs": {
1384
+ "kernel": "(1, 1)",
1385
+ "no_bias": "True",
1386
+ "num_filter": "128",
1387
+ "num_group": "1",
1388
+ "pad": "(0, 0)",
1389
+ "stride": "(1, 1)"
1390
+ },
1391
+ "inputs": []
1392
+ },
1393
+ {
1394
+ "op": "Convolution",
1395
+ "name": "conv_11_conv2d",
1396
+ "attrs": {
1397
+ "kernel": "(1, 1)",
1398
+ "no_bias": "True",
1399
+ "num_filter": "128",
1400
+ "num_group": "1",
1401
+ "pad": "(0, 0)",
1402
+ "stride": "(1, 1)"
1403
+ },
1404
+ "inputs": [[162, 0, 0], [163, 0, 0]]
1405
+ },
1406
+ {
1407
+ "op": "null",
1408
+ "name": "conv_11_batchnorm_gamma",
1409
+ "attrs": {"fix_gamma": "True"},
1410
+ "inputs": []
1411
+ },
1412
+ {
1413
+ "op": "null",
1414
+ "name": "conv_11_batchnorm_beta",
1415
+ "attrs": {"fix_gamma": "True"},
1416
+ "inputs": []
1417
+ },
1418
+ {
1419
+ "op": "null",
1420
+ "name": "conv_11_batchnorm_moving_mean",
1421
+ "attrs": {
1422
+ "__init__": "[\"zero\", {}]",
1423
+ "fix_gamma": "True"
1424
+ },
1425
+ "inputs": []
1426
+ },
1427
+ {
1428
+ "op": "null",
1429
+ "name": "conv_11_batchnorm_moving_var",
1430
+ "attrs": {
1431
+ "__init__": "[\"one\", {}]",
1432
+ "fix_gamma": "True"
1433
+ },
1434
+ "inputs": []
1435
+ },
1436
+ {
1437
+ "op": "BatchNorm",
1438
+ "name": "conv_11_batchnorm",
1439
+ "attrs": {"fix_gamma": "True"},
1440
+ "inputs": [[164, 0, 0], [165, 0, 0], [166, 0, 0], [167, 0, 1], [168, 0, 1]]
1441
+ },
1442
+ {
1443
+ "op": "Activation",
1444
+ "name": "conv_11_relu",
1445
+ "attrs": {"act_type": "relu"},
1446
+ "inputs": [[169, 0, 0]]
1447
+ },
1448
+ {
1449
+ "op": "null",
1450
+ "name": "conv_12_dw_conv2d_weight",
1451
+ "attrs": {
1452
+ "kernel": "(3, 3)",
1453
+ "no_bias": "True",
1454
+ "num_filter": "128",
1455
+ "num_group": "128",
1456
+ "pad": "(1, 1)",
1457
+ "stride": "(1, 1)"
1458
+ },
1459
+ "inputs": []
1460
+ },
1461
+ {
1462
+ "op": "Convolution",
1463
+ "name": "conv_12_dw_conv2d",
1464
+ "attrs": {
1465
+ "kernel": "(3, 3)",
1466
+ "no_bias": "True",
1467
+ "num_filter": "128",
1468
+ "num_group": "128",
1469
+ "pad": "(1, 1)",
1470
+ "stride": "(1, 1)"
1471
+ },
1472
+ "inputs": [[170, 0, 0], [171, 0, 0]]
1473
+ },
1474
+ {
1475
+ "op": "null",
1476
+ "name": "conv_12_dw_batchnorm_gamma",
1477
+ "attrs": {"fix_gamma": "True"},
1478
+ "inputs": []
1479
+ },
1480
+ {
1481
+ "op": "null",
1482
+ "name": "conv_12_dw_batchnorm_beta",
1483
+ "attrs": {"fix_gamma": "True"},
1484
+ "inputs": []
1485
+ },
1486
+ {
1487
+ "op": "null",
1488
+ "name": "conv_12_dw_batchnorm_moving_mean",
1489
+ "attrs": {
1490
+ "__init__": "[\"zero\", {}]",
1491
+ "fix_gamma": "True"
1492
+ },
1493
+ "inputs": []
1494
+ },
1495
+ {
1496
+ "op": "null",
1497
+ "name": "conv_12_dw_batchnorm_moving_var",
1498
+ "attrs": {
1499
+ "__init__": "[\"one\", {}]",
1500
+ "fix_gamma": "True"
1501
+ },
1502
+ "inputs": []
1503
+ },
1504
+ {
1505
+ "op": "BatchNorm",
1506
+ "name": "conv_12_dw_batchnorm",
1507
+ "attrs": {"fix_gamma": "True"},
1508
+ "inputs": [[172, 0, 0], [173, 0, 0], [174, 0, 0], [175, 0, 1], [176, 0, 1]]
1509
+ },
1510
+ {
1511
+ "op": "Activation",
1512
+ "name": "conv_12_dw_relu",
1513
+ "attrs": {"act_type": "relu"},
1514
+ "inputs": [[177, 0, 0]]
1515
+ },
1516
+ {
1517
+ "op": "null",
1518
+ "name": "conv_12_conv2d_weight",
1519
+ "attrs": {
1520
+ "kernel": "(1, 1)",
1521
+ "no_bias": "True",
1522
+ "num_filter": "128",
1523
+ "num_group": "1",
1524
+ "pad": "(0, 0)",
1525
+ "stride": "(1, 1)"
1526
+ },
1527
+ "inputs": []
1528
+ },
1529
+ {
1530
+ "op": "Convolution",
1531
+ "name": "conv_12_conv2d",
1532
+ "attrs": {
1533
+ "kernel": "(1, 1)",
1534
+ "no_bias": "True",
1535
+ "num_filter": "128",
1536
+ "num_group": "1",
1537
+ "pad": "(0, 0)",
1538
+ "stride": "(1, 1)"
1539
+ },
1540
+ "inputs": [[178, 0, 0], [179, 0, 0]]
1541
+ },
1542
+ {
1543
+ "op": "null",
1544
+ "name": "conv_12_batchnorm_gamma",
1545
+ "attrs": {"fix_gamma": "True"},
1546
+ "inputs": []
1547
+ },
1548
+ {
1549
+ "op": "null",
1550
+ "name": "conv_12_batchnorm_beta",
1551
+ "attrs": {"fix_gamma": "True"},
1552
+ "inputs": []
1553
+ },
1554
+ {
1555
+ "op": "null",
1556
+ "name": "conv_12_batchnorm_moving_mean",
1557
+ "attrs": {
1558
+ "__init__": "[\"zero\", {}]",
1559
+ "fix_gamma": "True"
1560
+ },
1561
+ "inputs": []
1562
+ },
1563
+ {
1564
+ "op": "null",
1565
+ "name": "conv_12_batchnorm_moving_var",
1566
+ "attrs": {
1567
+ "__init__": "[\"one\", {}]",
1568
+ "fix_gamma": "True"
1569
+ },
1570
+ "inputs": []
1571
+ },
1572
+ {
1573
+ "op": "BatchNorm",
1574
+ "name": "conv_12_batchnorm",
1575
+ "attrs": {"fix_gamma": "True"},
1576
+ "inputs": [[180, 0, 0], [181, 0, 0], [182, 0, 0], [183, 0, 1], [184, 0, 1]]
1577
+ },
1578
+ {
1579
+ "op": "Activation",
1580
+ "name": "conv_12_relu",
1581
+ "attrs": {"act_type": "relu"},
1582
+ "inputs": [[185, 0, 0]]
1583
+ },
1584
+ {
1585
+ "op": "null",
1586
+ "name": "conv_13_dw_conv2d_weight",
1587
+ "attrs": {
1588
+ "kernel": "(3, 3)",
1589
+ "no_bias": "True",
1590
+ "num_filter": "128",
1591
+ "num_group": "128",
1592
+ "pad": "(1, 1)",
1593
+ "stride": "(2, 2)"
1594
+ },
1595
+ "inputs": []
1596
+ },
1597
+ {
1598
+ "op": "Convolution",
1599
+ "name": "conv_13_dw_conv2d",
1600
+ "attrs": {
1601
+ "kernel": "(3, 3)",
1602
+ "no_bias": "True",
1603
+ "num_filter": "128",
1604
+ "num_group": "128",
1605
+ "pad": "(1, 1)",
1606
+ "stride": "(2, 2)"
1607
+ },
1608
+ "inputs": [[186, 0, 0], [187, 0, 0]]
1609
+ },
1610
+ {
1611
+ "op": "null",
1612
+ "name": "conv_13_dw_batchnorm_gamma",
1613
+ "attrs": {"fix_gamma": "True"},
1614
+ "inputs": []
1615
+ },
1616
+ {
1617
+ "op": "null",
1618
+ "name": "conv_13_dw_batchnorm_beta",
1619
+ "attrs": {"fix_gamma": "True"},
1620
+ "inputs": []
1621
+ },
1622
+ {
1623
+ "op": "null",
1624
+ "name": "conv_13_dw_batchnorm_moving_mean",
1625
+ "attrs": {
1626
+ "__init__": "[\"zero\", {}]",
1627
+ "fix_gamma": "True"
1628
+ },
1629
+ "inputs": []
1630
+ },
1631
+ {
1632
+ "op": "null",
1633
+ "name": "conv_13_dw_batchnorm_moving_var",
1634
+ "attrs": {
1635
+ "__init__": "[\"one\", {}]",
1636
+ "fix_gamma": "True"
1637
+ },
1638
+ "inputs": []
1639
+ },
1640
+ {
1641
+ "op": "BatchNorm",
1642
+ "name": "conv_13_dw_batchnorm",
1643
+ "attrs": {"fix_gamma": "True"},
1644
+ "inputs": [[188, 0, 0], [189, 0, 0], [190, 0, 0], [191, 0, 1], [192, 0, 1]]
1645
+ },
1646
+ {
1647
+ "op": "Activation",
1648
+ "name": "conv_13_dw_relu",
1649
+ "attrs": {"act_type": "relu"},
1650
+ "inputs": [[193, 0, 0]]
1651
+ },
1652
+ {
1653
+ "op": "null",
1654
+ "name": "conv_13_conv2d_weight",
1655
+ "attrs": {
1656
+ "kernel": "(1, 1)",
1657
+ "no_bias": "True",
1658
+ "num_filter": "256",
1659
+ "num_group": "1",
1660
+ "pad": "(0, 0)",
1661
+ "stride": "(1, 1)"
1662
+ },
1663
+ "inputs": []
1664
+ },
1665
+ {
1666
+ "op": "Convolution",
1667
+ "name": "conv_13_conv2d",
1668
+ "attrs": {
1669
+ "kernel": "(1, 1)",
1670
+ "no_bias": "True",
1671
+ "num_filter": "256",
1672
+ "num_group": "1",
1673
+ "pad": "(0, 0)",
1674
+ "stride": "(1, 1)"
1675
+ },
1676
+ "inputs": [[194, 0, 0], [195, 0, 0]]
1677
+ },
1678
+ {
1679
+ "op": "null",
1680
+ "name": "conv_13_batchnorm_gamma",
1681
+ "attrs": {"fix_gamma": "True"},
1682
+ "inputs": []
1683
+ },
1684
+ {
1685
+ "op": "null",
1686
+ "name": "conv_13_batchnorm_beta",
1687
+ "attrs": {"fix_gamma": "True"},
1688
+ "inputs": []
1689
+ },
1690
+ {
1691
+ "op": "null",
1692
+ "name": "conv_13_batchnorm_moving_mean",
1693
+ "attrs": {
1694
+ "__init__": "[\"zero\", {}]",
1695
+ "fix_gamma": "True"
1696
+ },
1697
+ "inputs": []
1698
+ },
1699
+ {
1700
+ "op": "null",
1701
+ "name": "conv_13_batchnorm_moving_var",
1702
+ "attrs": {
1703
+ "__init__": "[\"one\", {}]",
1704
+ "fix_gamma": "True"
1705
+ },
1706
+ "inputs": []
1707
+ },
1708
+ {
1709
+ "op": "BatchNorm",
1710
+ "name": "conv_13_batchnorm",
1711
+ "attrs": {"fix_gamma": "True"},
1712
+ "inputs": [[196, 0, 0], [197, 0, 0], [198, 0, 0], [199, 0, 1], [200, 0, 1]]
1713
+ },
1714
+ {
1715
+ "op": "Activation",
1716
+ "name": "conv_13_relu",
1717
+ "attrs": {"act_type": "relu"},
1718
+ "inputs": [[201, 0, 0]]
1719
+ },
1720
+ {
1721
+ "op": "null",
1722
+ "name": "conv_14_dw_conv2d_weight",
1723
+ "attrs": {
1724
+ "kernel": "(3, 3)",
1725
+ "no_bias": "True",
1726
+ "num_filter": "256",
1727
+ "num_group": "256",
1728
+ "pad": "(1, 1)",
1729
+ "stride": "(1, 1)"
1730
+ },
1731
+ "inputs": []
1732
+ },
1733
+ {
1734
+ "op": "Convolution",
1735
+ "name": "conv_14_dw_conv2d",
1736
+ "attrs": {
1737
+ "kernel": "(3, 3)",
1738
+ "no_bias": "True",
1739
+ "num_filter": "256",
1740
+ "num_group": "256",
1741
+ "pad": "(1, 1)",
1742
+ "stride": "(1, 1)"
1743
+ },
1744
+ "inputs": [[202, 0, 0], [203, 0, 0]]
1745
+ },
1746
+ {
1747
+ "op": "null",
1748
+ "name": "conv_14_dw_batchnorm_gamma",
1749
+ "attrs": {"fix_gamma": "True"},
1750
+ "inputs": []
1751
+ },
1752
+ {
1753
+ "op": "null",
1754
+ "name": "conv_14_dw_batchnorm_beta",
1755
+ "attrs": {"fix_gamma": "True"},
1756
+ "inputs": []
1757
+ },
1758
+ {
1759
+ "op": "null",
1760
+ "name": "conv_14_dw_batchnorm_moving_mean",
1761
+ "attrs": {
1762
+ "__init__": "[\"zero\", {}]",
1763
+ "fix_gamma": "True"
1764
+ },
1765
+ "inputs": []
1766
+ },
1767
+ {
1768
+ "op": "null",
1769
+ "name": "conv_14_dw_batchnorm_moving_var",
1770
+ "attrs": {
1771
+ "__init__": "[\"one\", {}]",
1772
+ "fix_gamma": "True"
1773
+ },
1774
+ "inputs": []
1775
+ },
1776
+ {
1777
+ "op": "BatchNorm",
1778
+ "name": "conv_14_dw_batchnorm",
1779
+ "attrs": {"fix_gamma": "True"},
1780
+ "inputs": [[204, 0, 0], [205, 0, 0], [206, 0, 0], [207, 0, 1], [208, 0, 1]]
1781
+ },
1782
+ {
1783
+ "op": "Activation",
1784
+ "name": "conv_14_dw_relu",
1785
+ "attrs": {"act_type": "relu"},
1786
+ "inputs": [[209, 0, 0]]
1787
+ },
1788
+ {
1789
+ "op": "null",
1790
+ "name": "conv_14_conv2d_weight",
1791
+ "attrs": {
1792
+ "kernel": "(1, 1)",
1793
+ "no_bias": "True",
1794
+ "num_filter": "256",
1795
+ "num_group": "1",
1796
+ "pad": "(0, 0)",
1797
+ "stride": "(1, 1)"
1798
+ },
1799
+ "inputs": []
1800
+ },
1801
+ {
1802
+ "op": "Convolution",
1803
+ "name": "conv_14_conv2d",
1804
+ "attrs": {
1805
+ "kernel": "(1, 1)",
1806
+ "no_bias": "True",
1807
+ "num_filter": "256",
1808
+ "num_group": "1",
1809
+ "pad": "(0, 0)",
1810
+ "stride": "(1, 1)"
1811
+ },
1812
+ "inputs": [[210, 0, 0], [211, 0, 0]]
1813
+ },
1814
+ {
1815
+ "op": "null",
1816
+ "name": "conv_14_batchnorm_gamma",
1817
+ "attrs": {"fix_gamma": "True"},
1818
+ "inputs": []
1819
+ },
1820
+ {
1821
+ "op": "null",
1822
+ "name": "conv_14_batchnorm_beta",
1823
+ "attrs": {"fix_gamma": "True"},
1824
+ "inputs": []
1825
+ },
1826
+ {
1827
+ "op": "null",
1828
+ "name": "conv_14_batchnorm_moving_mean",
1829
+ "attrs": {
1830
+ "__init__": "[\"zero\", {}]",
1831
+ "fix_gamma": "True"
1832
+ },
1833
+ "inputs": []
1834
+ },
1835
+ {
1836
+ "op": "null",
1837
+ "name": "conv_14_batchnorm_moving_var",
1838
+ "attrs": {
1839
+ "__init__": "[\"one\", {}]",
1840
+ "fix_gamma": "True"
1841
+ },
1842
+ "inputs": []
1843
+ },
1844
+ {
1845
+ "op": "BatchNorm",
1846
+ "name": "conv_14_batchnorm",
1847
+ "attrs": {"fix_gamma": "True"},
1848
+ "inputs": [[212, 0, 0], [213, 0, 0], [214, 0, 0], [215, 0, 1], [216, 0, 1]]
1849
+ },
1850
+ {
1851
+ "op": "Activation",
1852
+ "name": "conv_14_relu",
1853
+ "attrs": {"act_type": "relu"},
1854
+ "inputs": [[217, 0, 0]]
1855
+ },
1856
+ {
1857
+ "op": "null",
1858
+ "name": "bn1_gamma",
1859
+ "attrs": {
1860
+ "eps": "2e-05",
1861
+ "fix_gamma": "False",
1862
+ "momentum": "0.9"
1863
+ },
1864
+ "inputs": []
1865
+ },
1866
+ {
1867
+ "op": "null",
1868
+ "name": "bn1_beta",
1869
+ "attrs": {
1870
+ "eps": "2e-05",
1871
+ "fix_gamma": "False",
1872
+ "momentum": "0.9"
1873
+ },
1874
+ "inputs": []
1875
+ },
1876
+ {
1877
+ "op": "null",
1878
+ "name": "bn1_moving_mean",
1879
+ "attrs": {
1880
+ "__init__": "[\"zero\", {}]",
1881
+ "eps": "2e-05",
1882
+ "fix_gamma": "False",
1883
+ "momentum": "0.9"
1884
+ },
1885
+ "inputs": []
1886
+ },
1887
+ {
1888
+ "op": "null",
1889
+ "name": "bn1_moving_var",
1890
+ "attrs": {
1891
+ "__init__": "[\"one\", {}]",
1892
+ "eps": "2e-05",
1893
+ "fix_gamma": "False",
1894
+ "momentum": "0.9"
1895
+ },
1896
+ "inputs": []
1897
+ },
1898
+ {
1899
+ "op": "BatchNorm",
1900
+ "name": "bn1",
1901
+ "attrs": {
1902
+ "eps": "2e-05",
1903
+ "fix_gamma": "False",
1904
+ "momentum": "0.9"
1905
+ },
1906
+ "inputs": [[218, 0, 0], [219, 0, 0], [220, 0, 0], [221, 0, 1], [222, 0, 1]]
1907
+ },
1908
+ {
1909
+ "op": "null",
1910
+ "name": "relu1_gamma",
1911
+ "attrs": {
1912
+ "__init__": "[\"Constant\", {\"value\": 0.25}]",
1913
+ "act_type": "prelu"
1914
+ },
1915
+ "inputs": []
1916
+ },
1917
+ {
1918
+ "op": "LeakyReLU",
1919
+ "name": "relu1",
1920
+ "attrs": {"act_type": "prelu"},
1921
+ "inputs": [[223, 0, 0], [224, 0, 0]]
1922
+ },
1923
+ {
1924
+ "op": "Pooling",
1925
+ "name": "pool1",
1926
+ "attrs": {
1927
+ "global_pool": "True",
1928
+ "kernel": "(7, 7)",
1929
+ "pool_type": "avg"
1930
+ },
1931
+ "inputs": [[225, 0, 0]]
1932
+ },
1933
+ {
1934
+ "op": "Flatten",
1935
+ "name": "flatten0",
1936
+ "inputs": [[226, 0, 0]]
1937
+ },
1938
+ {
1939
+ "op": "null",
1940
+ "name": "pre_fc1_weight",
1941
+ "attrs": {"num_hidden": "202"},
1942
+ "inputs": []
1943
+ },
1944
+ {
1945
+ "op": "null",
1946
+ "name": "pre_fc1_bias",
1947
+ "attrs": {"num_hidden": "202"},
1948
+ "inputs": []
1949
+ },
1950
+ {
1951
+ "op": "FullyConnected",
1952
+ "name": "pre_fc1",
1953
+ "attrs": {"num_hidden": "202"},
1954
+ "inputs": [[227, 0, 0], [228, 0, 0], [229, 0, 0]]
1955
+ },
1956
+ {
1957
+ "op": "null",
1958
+ "name": "fc1_gamma",
1959
+ "attrs": {
1960
+ "eps": "2e-05",
1961
+ "fix_gamma": "True",
1962
+ "momentum": "0.9"
1963
+ },
1964
+ "inputs": []
1965
+ },
1966
+ {
1967
+ "op": "null",
1968
+ "name": "fc1_beta",
1969
+ "attrs": {
1970
+ "eps": "2e-05",
1971
+ "fix_gamma": "True",
1972
+ "momentum": "0.9"
1973
+ },
1974
+ "inputs": []
1975
+ },
1976
+ {
1977
+ "op": "null",
1978
+ "name": "fc1_moving_mean",
1979
+ "attrs": {
1980
+ "__init__": "[\"zero\", {}]",
1981
+ "eps": "2e-05",
1982
+ "fix_gamma": "True",
1983
+ "momentum": "0.9"
1984
+ },
1985
+ "inputs": []
1986
+ },
1987
+ {
1988
+ "op": "null",
1989
+ "name": "fc1_moving_var",
1990
+ "attrs": {
1991
+ "__init__": "[\"one\", {}]",
1992
+ "eps": "2e-05",
1993
+ "fix_gamma": "True",
1994
+ "momentum": "0.9"
1995
+ },
1996
+ "inputs": []
1997
+ },
1998
+ {
1999
+ "op": "BatchNorm",
2000
+ "name": "fc1",
2001
+ "attrs": {
2002
+ "eps": "2e-05",
2003
+ "fix_gamma": "True",
2004
+ "momentum": "0.9"
2005
+ },
2006
+ "inputs": [[230, 0, 0], [231, 0, 0], [232, 0, 0], [233, 0, 1], [234, 0, 1]]
2007
+ }
2008
+ ],
2009
+ "arg_nodes": [
2010
+ 0,
2011
+ 3,
2012
+ 5,
2013
+ 6,
2014
+ 7,
2015
+ 8,
2016
+ 11,
2017
+ 13,
2018
+ 14,
2019
+ 15,
2020
+ 16,
2021
+ 19,
2022
+ 21,
2023
+ 22,
2024
+ 23,
2025
+ 24,
2026
+ 27,
2027
+ 29,
2028
+ 30,
2029
+ 31,
2030
+ 32,
2031
+ 35,
2032
+ 37,
2033
+ 38,
2034
+ 39,
2035
+ 40,
2036
+ 43,
2037
+ 45,
2038
+ 46,
2039
+ 47,
2040
+ 48,
2041
+ 51,
2042
+ 53,
2043
+ 54,
2044
+ 55,
2045
+ 56,
2046
+ 59,
2047
+ 61,
2048
+ 62,
2049
+ 63,
2050
+ 64,
2051
+ 67,
2052
+ 69,
2053
+ 70,
2054
+ 71,
2055
+ 72,
2056
+ 75,
2057
+ 77,
2058
+ 78,
2059
+ 79,
2060
+ 80,
2061
+ 83,
2062
+ 85,
2063
+ 86,
2064
+ 87,
2065
+ 88,
2066
+ 91,
2067
+ 93,
2068
+ 94,
2069
+ 95,
2070
+ 96,
2071
+ 99,
2072
+ 101,
2073
+ 102,
2074
+ 103,
2075
+ 104,
2076
+ 107,
2077
+ 109,
2078
+ 110,
2079
+ 111,
2080
+ 112,
2081
+ 115,
2082
+ 117,
2083
+ 118,
2084
+ 119,
2085
+ 120,
2086
+ 123,
2087
+ 125,
2088
+ 126,
2089
+ 127,
2090
+ 128,
2091
+ 131,
2092
+ 133,
2093
+ 134,
2094
+ 135,
2095
+ 136,
2096
+ 139,
2097
+ 141,
2098
+ 142,
2099
+ 143,
2100
+ 144,
2101
+ 147,
2102
+ 149,
2103
+ 150,
2104
+ 151,
2105
+ 152,
2106
+ 155,
2107
+ 157,
2108
+ 158,
2109
+ 159,
2110
+ 160,
2111
+ 163,
2112
+ 165,
2113
+ 166,
2114
+ 167,
2115
+ 168,
2116
+ 171,
2117
+ 173,
2118
+ 174,
2119
+ 175,
2120
+ 176,
2121
+ 179,
2122
+ 181,
2123
+ 182,
2124
+ 183,
2125
+ 184,
2126
+ 187,
2127
+ 189,
2128
+ 190,
2129
+ 191,
2130
+ 192,
2131
+ 195,
2132
+ 197,
2133
+ 198,
2134
+ 199,
2135
+ 200,
2136
+ 203,
2137
+ 205,
2138
+ 206,
2139
+ 207,
2140
+ 208,
2141
+ 211,
2142
+ 213,
2143
+ 214,
2144
+ 215,
2145
+ 216,
2146
+ 219,
2147
+ 220,
2148
+ 221,
2149
+ 222,
2150
+ 224,
2151
+ 228,
2152
+ 229,
2153
+ 231,
2154
+ 232,
2155
+ 233,
2156
+ 234
2157
+ ],
2158
+ "node_row_ptr": [
2159
+ 0,
2160
+ 1,
2161
+ 2,
2162
+ 3,
2163
+ 4,
2164
+ 5,
2165
+ 6,
2166
+ 7,
2167
+ 8,
2168
+ 9,
2169
+ 12,
2170
+ 13,
2171
+ 14,
2172
+ 15,
2173
+ 16,
2174
+ 17,
2175
+ 18,
2176
+ 19,
2177
+ 22,
2178
+ 23,
2179
+ 24,
2180
+ 25,
2181
+ 26,
2182
+ 27,
2183
+ 28,
2184
+ 29,
2185
+ 32,
2186
+ 33,
2187
+ 34,
2188
+ 35,
2189
+ 36,
2190
+ 37,
2191
+ 38,
2192
+ 39,
2193
+ 42,
2194
+ 43,
2195
+ 44,
2196
+ 45,
2197
+ 46,
2198
+ 47,
2199
+ 48,
2200
+ 49,
2201
+ 52,
2202
+ 53,
2203
+ 54,
2204
+ 55,
2205
+ 56,
2206
+ 57,
2207
+ 58,
2208
+ 59,
2209
+ 62,
2210
+ 63,
2211
+ 64,
2212
+ 65,
2213
+ 66,
2214
+ 67,
2215
+ 68,
2216
+ 69,
2217
+ 72,
2218
+ 73,
2219
+ 74,
2220
+ 75,
2221
+ 76,
2222
+ 77,
2223
+ 78,
2224
+ 79,
2225
+ 82,
2226
+ 83,
2227
+ 84,
2228
+ 85,
2229
+ 86,
2230
+ 87,
2231
+ 88,
2232
+ 89,
2233
+ 92,
2234
+ 93,
2235
+ 94,
2236
+ 95,
2237
+ 96,
2238
+ 97,
2239
+ 98,
2240
+ 99,
2241
+ 102,
2242
+ 103,
2243
+ 104,
2244
+ 105,
2245
+ 106,
2246
+ 107,
2247
+ 108,
2248
+ 109,
2249
+ 112,
2250
+ 113,
2251
+ 114,
2252
+ 115,
2253
+ 116,
2254
+ 117,
2255
+ 118,
2256
+ 119,
2257
+ 122,
2258
+ 123,
2259
+ 124,
2260
+ 125,
2261
+ 126,
2262
+ 127,
2263
+ 128,
2264
+ 129,
2265
+ 132,
2266
+ 133,
2267
+ 134,
2268
+ 135,
2269
+ 136,
2270
+ 137,
2271
+ 138,
2272
+ 139,
2273
+ 142,
2274
+ 143,
2275
+ 144,
2276
+ 145,
2277
+ 146,
2278
+ 147,
2279
+ 148,
2280
+ 149,
2281
+ 152,
2282
+ 153,
2283
+ 154,
2284
+ 155,
2285
+ 156,
2286
+ 157,
2287
+ 158,
2288
+ 159,
2289
+ 162,
2290
+ 163,
2291
+ 164,
2292
+ 165,
2293
+ 166,
2294
+ 167,
2295
+ 168,
2296
+ 169,
2297
+ 172,
2298
+ 173,
2299
+ 174,
2300
+ 175,
2301
+ 176,
2302
+ 177,
2303
+ 178,
2304
+ 179,
2305
+ 182,
2306
+ 183,
2307
+ 184,
2308
+ 185,
2309
+ 186,
2310
+ 187,
2311
+ 188,
2312
+ 189,
2313
+ 192,
2314
+ 193,
2315
+ 194,
2316
+ 195,
2317
+ 196,
2318
+ 197,
2319
+ 198,
2320
+ 199,
2321
+ 202,
2322
+ 203,
2323
+ 204,
2324
+ 205,
2325
+ 206,
2326
+ 207,
2327
+ 208,
2328
+ 209,
2329
+ 212,
2330
+ 213,
2331
+ 214,
2332
+ 215,
2333
+ 216,
2334
+ 217,
2335
+ 218,
2336
+ 219,
2337
+ 222,
2338
+ 223,
2339
+ 224,
2340
+ 225,
2341
+ 226,
2342
+ 227,
2343
+ 228,
2344
+ 229,
2345
+ 232,
2346
+ 233,
2347
+ 234,
2348
+ 235,
2349
+ 236,
2350
+ 237,
2351
+ 238,
2352
+ 239,
2353
+ 242,
2354
+ 243,
2355
+ 244,
2356
+ 245,
2357
+ 246,
2358
+ 247,
2359
+ 248,
2360
+ 249,
2361
+ 252,
2362
+ 253,
2363
+ 254,
2364
+ 255,
2365
+ 256,
2366
+ 257,
2367
+ 258,
2368
+ 259,
2369
+ 262,
2370
+ 263,
2371
+ 264,
2372
+ 265,
2373
+ 266,
2374
+ 267,
2375
+ 268,
2376
+ 269,
2377
+ 272,
2378
+ 273,
2379
+ 274,
2380
+ 275,
2381
+ 276,
2382
+ 277,
2383
+ 280,
2384
+ 281,
2385
+ 282,
2386
+ 283,
2387
+ 284,
2388
+ 285,
2389
+ 286,
2390
+ 287,
2391
+ 288,
2392
+ 289,
2393
+ 290,
2394
+ 291,
2395
+ 294
2396
+ ],
2397
+ "heads": [[235, 0, 0]],
2398
+ "attrs": {"mxnet_version": ["int", 10300]}
2399
+ }
insightface/models/retinaface_r50_v1/R50-0000.params ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:20818d53adcefea4d3c4f31ba555910b9d052836588607af50af28cb414cb31e
3
+ size 118010124
insightface/models/retinaface_r50_v1/R50-symbol.json ADDED
The diff for this file is too large to render. See raw diff
 
insightface/models/scrfd_10g/model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:963570df5e0ebf6bb313239d0f9f3f0c096c1ff6937e8e28e45abad4d8b1d5c7
3
+ size 15545065
insightface/models/scrfd_10g_bnkps/model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d431436577d01c827abd78aa40c782b8fb318c26555ac60582144aaf66867411
3
+ size 17005828
insightface/models/scrfd_1g/model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c7d7d654c992c1581270461466a52c876234ad8be0ad8de37b9782d9f03beb86
3
+ size 2647067
insightface/models/scrfd_2.5g/model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fbe1d35ac6e0859307067bc3ccd44973b536b451437d23547fc460a05d00993f
3
+ size 2781443
insightface/models/scrfd_2.5g_bnkps/model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3db3b99c09e9212e9f2bb3970f6e641ec1812f27b19753f68289326067209662
3
+ size 3346972
insightface/models/scrfd_34g/model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6f69956639da31c96d8985c9a0ce1f5798f42cb64909159596e7a5f544ebe00
3
+ size 39677731
insightface/models/scrfd_500m/model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1043ab96cff67ee8ebb5fc2819f23f3620a128d133f5b5234cd2aedeeb83b5f0
3
+ size 2404021
insightface/models/scrfd_500m_bnkps/model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b04315df8db019067edacaceb73484e531981442f321432b8bf003e9812d6b3d
3
+ size 2669108
insightface/models/scrfd_person_2.5g.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:76522ba15eecb0712780509e912884aba066e9834be0c85761918cdcf76de5b5
3
+ size 3710223
insightface/models/synthetic_resnet50d.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:01b3d5533999da3e605e5b9d99fb0a2a55e634467346c7504e3fbf778cfb219e
3
+ size 190838028
talknet-asd/.dockerignore ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # The .dockerignore file excludes files from the container build process.
2
+ #
3
+ # https://docs.docker.com/engine/reference/builder/#dockerignore-file
4
+
5
+ # Cog
6
+ /demo/*
7
+
8
+ # Exclude Git files
9
+ .git
10
+ .github
11
+ .gitignore
12
+
13
+ # Exclude Python cache files
14
+ __pycache__
15
+ .mypy_cache
16
+ .pytest_cache
17
+ .ruff_cache
18
+
19
+ # Exclude Python virtual environment
20
+ /venv
talknet-asd/.gitignore ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Other files
2
+ *.model
3
+ *.pth
4
+ *.wav
5
+ *.mp4
6
+ *.txt
7
+ *.pcm
8
+ *.avi
9
+ data/
10
+ tests/
11
+ exps/
12
+ /demo/*
13
+ .cog
14
+
15
+ # Byte-compiled / optimized / DLL files
16
+ __pycache__/
17
+ *.py[cod]
18
+ *$py.class
19
+
20
+ # C extensions
21
+ *.so
22
+
23
+ # Distribution / packaging
24
+ .Python
25
+ build/
26
+ develop-eggs/
27
+ dist/
28
+ downloads/
29
+ eggs/
30
+ .eggs/
31
+ lib/
32
+ lib64/
33
+ parts/
34
+ sdist/
35
+ var/
36
+ wheels/
37
+ *.egg-info/
38
+ .installed.cfg
39
+ *.egg
40
+ MANIFEST
41
+
42
+ # PyInstaller
43
+ # Usually these files are written by a python script from a template
44
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
45
+ *.manifest
46
+ *.spec
47
+
48
+ # Installer logs
49
+ pip-log.txt
50
+ pip-delete-this-directory.txt
51
+
52
+ # Unit test / coverage reports
53
+ htmlcov/
54
+ .tox/
55
+ .coverage
56
+ .coverage.*
57
+ .cache
58
+ nosetests.xml
59
+ coverage.xml
60
+ *.cover
61
+ .hypothesis/
62
+ .pytest_cache/
63
+
64
+ # Translations
65
+ *.mo
66
+ *.pot
67
+
68
+ # Django stuff:
69
+ *.log
70
+ local_settings.py
71
+ db.sqlite3
72
+
73
+ # Flask stuff:
74
+ instance/
75
+ .webassets-cache
76
+
77
+ # Scrapy stuff:
78
+ .scrapy
79
+
80
+ # Sphinx documentation
81
+ docs/_build/
82
+
83
+ # PyBuilder
84
+ target/
85
+
86
+ # Jupyter Notebook
87
+ .ipynb_checkpoints
88
+
89
+ # pyenv
90
+ .python-version
91
+
92
+ # celery beat schedule file
93
+ celerybeat-schedule
94
+
95
+ # SageMath parsed files
96
+ *.sage.py
97
+
98
+ # Environments
99
+ .env
100
+ .venv
101
+ env/
102
+ venv/
103
+ ENV/
104
+ env.bak/
105
+ venv.bak/
106
+
107
+ # Spyder project settings
108
+ .spyderproject
109
+ .spyproject
110
+
111
+ # Rope project settings
112
+ .ropeproject
113
+
114
+ # mkdocs documentation
115
+ /site
116
+
117
+ # mypy
118
+ .mypy_cache/
talknet-asd/FAQ.md ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## 1. General Question
2
+
3
+ ### 1.1 Which dataset is used for training and testing ?
4
+ 'pretrain_AVA.model' is trained on the AVA training set and evaluate on the AVA val/test set, (Has the entire code)
5
+ 'pretrain_TalkSet.model' is trained on our TalkSet and evaluate on the Columbia ASD set or other raw videos.
6
+
7
+ ### 1.2 How to figure the variable length of data during training ?
8
+ We design a scheme to feed the variable-length data into the same mini-batch: we sort all videos by their length and put the videos with similar length into the same batch. We crop all videos into the minimum number of frames in this batch. In this way, we train the TalkNet with videos of different length without losing too much data.
9
+
10
+ ### 1.3 How to figure multiple faces on the screen ?
11
+ In the ASD task, when there are multiple face tracks in the video, we consider one track at a time. The face track of interest is given in each test trial. You can also consider the relationship between the faces on the screen at the same time. There are some papers about that.
12
+ ### 1.4 Error: RuntimeError: CUDA error: no kernel image is available for execution on the device
13
+ Do "pip install --pre torch torchvision -f https://download.pytorch.org/whl/nightly/cu111/torch_nightly.html -U", check this [page](https://github.com/pytorch/pytorch/issues/31285#issuecomment-739139454).
14
+
15
+ ### 1.5 Can not download csv, video data or pretrain model ?
16
+ I use google drive to upload the pretrain model and csv files. So you need to make sure you can use google drive under your internet. The error during extract video clips can be ignored.
17
+
18
+ ***
19
+
20
+ ## 2. TalkNet in AVA-Activespeaker dataset
21
+
22
+ ### 2.1 Can not reimplement the result ?
23
+ In our experiments, for the result in AVA validation set, for the same code/model, the best training result is 92.6mAP, the worst one is 92.1mAP. So it is reasonable if you get the result little different than this 92.3mAP. Also batchsize might effect the result (not too much).
24
+
25
+ ### 2.2 How to get the result in AVA test set ?
26
+ AVA test set did not release the labels. So you need to upload your csv result in their system. Notice that we delete add the first line in the `test_res.csv` file since we modify a bit for the `get_ava_active_speaker_performance.py`. You need to delete the first line when you upload it. For the upload file, you need to set all `label` as `SPEAKING_AUDIBLE`.
27
+
28
+ ### 2.3 What are the labels ? Where is SPEAKING_BUT_NOT_AUDIBLE ?
29
+ There are three labels: SPEAKING_AND_AUDIBLE, SPEAKING_BUT_NOT_AUDIBLE, NOT_SPEAKING, but in the finally evaluation, SPEAKING_BUT_NOT_AUDIBLE and NOT_SPEAKING share the same label. So this is a binary classification issue.
30
+
31
+ ### 2.4 How big your model ? How long for training?
32
+ Our model has 15.01M params, in one 22G GPU, each epoch we train 15 mins, evaluate in val set takes 5 mins. Train 25 epochs can get the best result. So at most 7 hours.
33
+
34
+ ***
35
+
36
+ ## 3. TalkNet in TalkSet and Columbia ASD dataset
37
+
38
+ ### 3.1 Why you build TalkSet instead of only use AVA dataset ?
39
+ Because we want our model can be used for all videos in the wild. AVA dataset has already provide the face bounding box for each trial, so for the videos not in AVA. If you want to do ASD, you need to do face detection first. In our experiments, the face detection method used in AVA is hard to reimplement. Which means we can hardly get the face area that similar to the detected face in AVA. Due to that, the model trained in AVA can not perform well in videos outside AVA if we use other face detection method.
40
+ Due to that, we build TalkSet, the face in TalkSet is all detected by S3FD. So for any raw video (Such as the videos in Col ASD dataset), we can use S3FD to do face detection first, then apply our TalkNet model to get the ASD label.
41
+
42
+ ### 3.2 TalkSet code can not work?
43
+ We did not verify this code. Because we just modify LRS3 and VoxCeleb2 to build this set, so we do not (or cannot) upload this set. This `generate_TalkSet.py` is what we used to generate this dataset, and we did not check it later. So it just used for your reference. We have already provide the data list, so you can generate this dataset based on it.
44
+
45
+ ***
46
+
47
+ ## 4. An ASD Demo with pretrained TalkNet model
48
+
49
+ ### 4.1 I try the demo, the performance is not so good ?
50
+ You can check the demo video `001.mp4` first and compare your output and our result `001_res.mp4` to make sure what you did is correct. So if you are not statisfied with the performance. We are sorry about that (~cry). We think this model can further improve. For the very short clips (less than 1s), small face and side face, the performance is not so good.
51
+
52
+ ### 4.2 I try the demo, the face can not be detected ?
53
+ That is the reason for the face detection model instead of the ASD part. You can use better face detecion model such as [Insightface](https://github.com/deepinsight/insightface
54
+ ). Only when the face can be detected, ASD model can perform to get the ASD labels.
talknet-asd/LICENSE.md ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2021 Tao Ruijie
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
talknet-asd/README.md ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Is someone talking? TalkNet: Audio-visual active speaker detection Model
2
+
3
+ This repository contains the code for our ACM MM 2021 paper (oral), TalkNet, an active speaker detection model to detect 'whether the face in the screen is speaking or not?'. [[Paper](https://arxiv.org/pdf/2107.06592.pdf)] [[Video_English](https://youtu.be/C6bpAgI9zxE)] [[Video_Chinese](https://www.bilibili.com/video/bv1Yw411d7HG)].
4
+
5
+ ### Updates:
6
+
7
+ A new [demo page](https://www.sievedata.com/functions/sieve/talknet-asd). Thanks the contribution from [mvoodarla](https://github.com/mvoodarla) !
8
+
9
+ ![overall.png](utils/overall.png)
10
+
11
+ - [**Awesome ASD**](https://github.com/TaoRuijie/TalkNet_ASD/blob/main/awesomeASD.md): Papers about active speaker detection in last years.
12
+
13
+ - **TalkNet in AVA-Activespeaker dataset**: The code to preprocess the AVA-ActiveSpeaker dataset, train TalkNet in AVA train set and evaluate it in AVA val/test set.
14
+
15
+ - **TalkNet in TalkSet and Columbia ASD dataset**: The code to generate TalkSet, an ASD dataset in the wild, based on VoxCeleb2 and LRS3, train TalkNet in TalkSet and evaluate it in Columnbia ASD dataset.
16
+
17
+ - **An ASD Demo with pretrained TalkNet model**: An end-to-end script to detect and mark the speaking face by the pretrained TalkNet model.
18
+
19
+ ***
20
+
21
+ ### Dependencies
22
+
23
+ Start from building the environment
24
+ ```
25
+ conda create -n TalkNet python=3.7.9 anaconda
26
+ conda activate TalkNet
27
+ pip install -r requirement.txt
28
+ ```
29
+
30
+ Start from the existing environment
31
+ ```
32
+ pip install -r requirement.txt
33
+ ```
34
+
35
+ ***
36
+
37
+ ## TalkNet in AVA-Activespeaker dataset
38
+
39
+ #### Data preparation
40
+
41
+ The following script can be used to download and prepare the AVA dataset for training.
42
+
43
+ ```
44
+ python trainTalkNet.py --dataPathAVA AVADataPath --download
45
+ ```
46
+
47
+ `AVADataPath` is the folder you want to save the AVA dataset and its preprocessing outputs, the details can be found in [here](https://github.com/TaoRuijie/TalkNet_ASD/blob/main/utils/tools.py#L34) . Please read them carefully.
48
+
49
+ #### Training
50
+ Then you can train TalkNet in AVA end-to-end by using:
51
+ ```
52
+ python trainTalkNet.py --dataPathAVA AVADataPath
53
+ ```
54
+ `exps/exps1/score.txt`: output score file, `exps/exp1/model/model_00xx.model`: trained model, `exps/exps1/val_res.csv`: prediction for val set.
55
+
56
+ #### Pretrained model
57
+ Our pretrained model performs `mAP: 92.3` in validation set, you can check it by using:
58
+ ```
59
+ python trainTalkNet.py --dataPathAVA AVADataPath --evaluation
60
+ ```
61
+ The pretrained model will automaticly be downloaded into `TalkNet_ASD/pretrain_AVA.model`. It performs `mAP: 90.8` in the testing set.
62
+
63
+ ***
64
+
65
+ ## TalkNet in TalkSet and Columbia ASD dataset
66
+
67
+ #### Data preparation
68
+
69
+ We find that it is challenge to apply the model we trained in AVA for the videos not in AVA (Reason is [here](https://github.com/TaoRuijie/TalkNet_ASD/blob/main/FAQ.md), Q3.1). So we build TalkSet, an active speaker detection dataset in the wild, based on `VoxCeleb2` and `LRS3`.
70
+
71
+ We do not plan to upload this dataset since we just modify it, instead of building it. In `TalkSet` folder we provide these `.txt` files to describe which files we used to generate the TalkSet and their ASD labels. You can generate this `TalkSet` if you are interested to train an ASD model in the wild.
72
+
73
+ Also, we have provided our pretrained TalkNet model in TalkSet. You can evaluate it in Columbia ASD dataset or other raw videos in the wild.
74
+
75
+ #### Usage
76
+
77
+ A pretrain model in TalkSet will be download into `TalkNet_ASD/pretrain_TalkSet.model` when using the following script:
78
+
79
+ ```
80
+ python demoTalkNet.py --evalCol --colSavePath colDataPath
81
+ ```
82
+
83
+ Also, Columnbia ASD dataset and the labels will be downloaded into `colDataPath`. Finally you can get the following F1 result.
84
+
85
+ | Name | Bell | Boll | Lieb | Long | Sick | Avg. |
86
+ |----- | ------ | ------ | ------ | ------ | ------ | ------ |
87
+ | F1 | 98.1 | 88.8 | 98.7 | 98.0 | 97.7 | 96.3 |
88
+
89
+ (This result is different from that in our paper because we train the model again, while the avg. F1 is very similar)
90
+ ***
91
+
92
+ ## An ASD Demo with pretrained TalkNet model
93
+
94
+ #### Data preparation
95
+
96
+ We build an end-to-end script to detect and extract the active speaker from the raw video by our pretrain model in TalkSet.
97
+
98
+ You can put the raw video (`.mp4` and `.avi` are both fine) into the `demo` folder, such as `001.mp4`.
99
+
100
+ #### Usage
101
+
102
+ ```
103
+ python demoTalkNet.py --videoName 001
104
+ ```
105
+
106
+ A pretrain model in TalkSet will be downloaded into `TalkNet_ASD/pretrain_TalkSet.model`. The structure of the output reults can be found in [here](https://github.com/TaoRuijie/TalkNet_ASD/blob/main/demoTalkNet.py#L351).
107
+
108
+ You can get the output video `demo/001/pyavi/video_out.avi`, which has marked the active speaker by green box and non-active speaker by red box.
109
+
110
+ If you want to evaluate by using cpu only, you can modify `demoTalkNet.py` and `talkNet.py` file: modify all `cuda` into `cpu`. Then replace line 83 in talkNet.py into `loadedState = torch.load(path,map_location=torch.device('cpu'))`
111
+
112
+ ***
113
+
114
+ ### Citation
115
+
116
+ Please cite the following if our paper or code is helpful to your research.
117
+ ```
118
+ @inproceedings{tao2021someone,
119
+ title={Is Someone Speaking? Exploring Long-term Temporal Features for Audio-visual Active Speaker Detection},
120
+ author={Tao, Ruijie and Pan, Zexu and Das, Rohan Kumar and Qian, Xinyuan and Shou, Mike Zheng and Li, Haizhou},
121
+ booktitle = {Proceedings of the 29th ACM International Conference on Multimedia},
122
+ pages = {3927–3935},
123
+ year={2021}
124
+ }
125
+ ```
126
+ I have summaried some potential [FAQs](https://github.com/TaoRuijie/TalkNet_ASD/blob/main/FAQ.md). You can also check the `issues` in Github for other questions that I have answered.
127
+
128
+ This is my first open-source work, please let me know if I can future improve in this repositories or there is anything wrong in our work. Thanks for your support!
129
+
130
+ ### Acknowledge
131
+
132
+ We study many useful projects in our codeing process, which includes:
133
+
134
+ The structure of the project layout and the audio encoder is learnt from this [repository](https://github.com/clovaai/voxceleb_trainer).
135
+
136
+ Demo for visulization is modified from this [repository](https://github.com/joonson/syncnet_python).
137
+
138
+ AVA data download code is learnt from this [repository](https://github.com/fuankarion/active-speakers-context).
139
+
140
+ The model for the visual frontend is learnt from this [repository](https://github.com/lordmartian/deep_avsr).
141
+
142
+ Thanks for these authors to open source their code!
143
+
144
+ ### Cooperation
145
+
146
+ If you are interested to work on this topic and have some ideas to implement, I am glad to collaborate and contribute with my experiences & knowlegde in this topic. Please contact me with ruijie.tao@u.nus.edu.
talknet-asd/TalkSet/README.md ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ### TalkSet Generation
2
+
3
+ You can check the 'train.txt' and 'test.txt' to generate TalkSet by your own.
4
+
5
+ This `generate_TalkSet.py` code is just used for your reference, I did not check it recently.
6
+
7
+ Input the LRS3, VoxCeleb2, 3 list files in `lists_in`
8
+ Output TalkSet, train.txt, test.txt (Here the test set is the validation set actually)
9
+
10
+ ### Usage:
11
+
12
+ Set the following parser based on the location of your data:
13
+
14
+ `out_path`: the output TalkSet location
15
+ `Vox_audio`: Location of the Vox2, training set, audio location
16
+ `Vox_video`: Location of the Vox2, training set, video location
17
+ `lrs3_audio`: Location of the LRS3, audio location
18
+ `lrs3_video`: Location of the LRS3, video location
19
+ `task`: The part of the TalkSet you want to generate, eg: TAudio
20
+ `num_cpu`: The num of the threads, higher will be faster, based on your PC performance, eg: 10
21
+
22
+ ```
23
+ python TalkSet/generate_TalkSet.py --task 'TAudio'
24
+ python TalkSet/generate_TalkSet.py --task 'FAudio'
25
+ python TalkSet/generate_TalkSet.py --task 'TFAudio'
26
+ python TalkSet/generate_TalkSet.py --task 'TSilence'
27
+ python TalkSet/generate_TalkSet.py --task 'FSilence'
28
+ python TalkSet/generate_TalkSet.py --task 'Fusion'
29
+ ```
30
+
31
+ ### Description:
32
+ For `lists_out\*.txt` files:
33
+ - The 1st row is the face clips data type,
34
+ - TAudio: audio is active, lip is moving, audio and lip are sync
35
+ - FAudio: audio is active, lip is moving, audio and lip are not sync (Speech from others)
36
+ - TFAudio: one part is 'TAudio', the other part is 'FAudio'
37
+ - TSilence: one part is 'TAudio', in the other part, audio is non-active, lip is not moving
38
+ - FSilence: one part is 'silence'(audio is non-active, lip is not moving), in the other part, audio is active, lip is not moving (Speech from others)
39
+ - The 2nd row is the path for the audio file (filename started from 'silence' is the data from LRS3, filename started from 'id.....' is the data from VoxCeleb2)
40
+ - The 3rd row is the path for the video file
41
+ - The 4th row is the length(seconds) of this data
42
+ - The 5th row is the start of 'active' clip (in FSilence, it presents the 'silence' part)
43
+ - The 6th row is the end of 'active' clip
44
+ - The 7th row is the start of 'non-active' clip (in FSilence, it presents the 'speech from others' part)
45
+ - The 8th row is the end of 'non-active' clip
46
+ - The 9th row is the file ID
47
+
48
+ The dataset generated will not be fixed each time because we randomly select FSlience data, and the change point is the random number. We believe the result will be similar. The whole time to generate the TalkSet will use about 3 to 6 hours in our experiments.
talknet-asd/TalkSet/generate_TalkSet.py ADDED
@@ -0,0 +1,391 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, glob, subprocess, argparse, sys, numpy, random, math, cv2
2
+ from itertools import repeat
3
+ from multiprocessing import Pool
4
+ from scipy.io import wavfile
5
+ from pydub import AudioSegment
6
+ from tqdm import tqdm
7
+
8
+ def get_length(input_video):
9
+ result = subprocess.run(['ffprobe', '-v', 'error', '-show_entries', 'format=duration', '-of', 'default=noprint_wrappers=1:nokey=1', input_video], stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
10
+ return float(result.stdout)
11
+
12
+ def read_Vox_lines(file):
13
+ Tlines, Flines = [], []
14
+ with open(file) as f_in:
15
+ while True:
16
+ line = f_in.readline()
17
+ if not line:
18
+ break
19
+ if int(line[0]):
20
+ Tlines.append(line)
21
+ else:
22
+ Flines.append(line)
23
+ return Tlines, Flines
24
+
25
+ def read_LRS3_ST(file):
26
+ lines = []
27
+ with open(file) as f_in:
28
+ while True:
29
+ line = f_in.readline()
30
+ if not line:
31
+ break
32
+ lines.append(line)
33
+ return lines[:30000]
34
+
35
+ def read_LRS3_S(file):
36
+ lines = []
37
+ with open(file) as f_in:
38
+ while True:
39
+ line = f_in.readline()
40
+ if not line:
41
+ break
42
+ start = int(line.split()[1]) / 100
43
+ end = int(line.split()[2]) / 100
44
+ if end - start <= 3: # Only select less than 3s
45
+ lines.append(line)
46
+ return lines[:30000]
47
+
48
+ def generate_TAudio(line, args):
49
+ # Get the id of the audio and video
50
+ audio_name = line.split()[1][:-4]
51
+ video_name = line.split()[2][:-4]
52
+ id1 = audio_name.split('/')[0]
53
+ name1 = audio_name.split('/')[0] + '_' + audio_name.split('/')[1] + '_' + audio_name.split('/')[2]
54
+ name2 = video_name.split('/')[0] + '_' + video_name.split('/')[1] + '_' + video_name.split('/')[2]
55
+ name = name1 + '_' + name2
56
+ audio_path = os.path.join(args.Vox_audio, audio_name + '.wav')
57
+ video_path = os.path.join(args.Vox_video, video_name + '.mp4')
58
+ out_audio_path = os.path.join(args.out_path, 'TAudio', id1 + '/' + name + '.wav')
59
+ out_video_path = os.path.join(args.out_path, 'TAudio', id1 + '/' + name + '.mp4')
60
+ os.makedirs(os.path.join(args.out_path, 'TAudio', id1), exist_ok = True)
61
+
62
+ # Read the audio data and the length of audio and video
63
+ audio = AudioSegment.from_file(audio_path, format="wav")
64
+ length_audio = len(audio) / 1000.0
65
+ length_video = get_length(video_path)
66
+ length_data = int(min(length_video, length_audio) * 100) / 100
67
+ audio = audio[:int(length_data * 1000)]
68
+
69
+ # Extract the video and audio
70
+ start = 0
71
+ end = length_data
72
+ audio.export(out_audio_path, format="wav")
73
+ cmd = "ffmpeg -y -ss %.3f -t %.3f -i %s -i %s -c:v copy -c:a aac -map 0:v:0 -map 1:a:0 -shortest %s -loglevel panic"% (start, end - start, video_path, out_audio_path, out_video_path)
74
+ subprocess.call(cmd, shell=True, stdout=None)
75
+
76
+ # # Write the txt file
77
+ start_T, end_T = 0, length_data
78
+ start_F, end_F= 0, 0
79
+ line_new = "TAudio" + ' ' + str(audio_name) + ' ' + str(video_name) + ' ' + str(length_data) \
80
+ + ' ' + str(start_T) + ' ' + str(end_T) + ' ' + str(start_F) + ' ' + str(end_F) + '\n'
81
+ return line_new
82
+
83
+ def generate_FAudio(line, args):
84
+ # Get the id of the audio and video
85
+ audio_name = line.split()[1][:-4]
86
+ video_name = line.split()[2][:-4]
87
+ id1 = audio_name.split('/')[0]
88
+ name1 = audio_name.split('/')[0] + '_' + audio_name.split('/')[1] + '_' + audio_name.split('/')[2]
89
+ name2 = video_name.split('/')[0] + '_' + video_name.split('/')[1] + '_' + video_name.split('/')[2]
90
+ name = name1 + '_' + name2
91
+ audio_path = os.path.join(args.Vox_audio, audio_name + '.wav')
92
+ video_path = os.path.join(args.Vox_video, video_name + '.mp4')
93
+ out_audio_path = os.path.join(args.out_path, 'FAudio', id1 + '/' + name + '.wav')
94
+ out_video_path = os.path.join(args.out_path, 'FAudio', id1 + '/' + name + '.mp4')
95
+ os.makedirs(os.path.join(args.out_path, 'FAudio', id1), exist_ok = True)
96
+
97
+ # Read the audio data and the length of audio and video
98
+ audio = AudioSegment.from_file(audio_path, format="wav")
99
+ length_audio = len(audio) / 1000.0
100
+ length_video = get_length(video_path)
101
+ length_data = int(min(length_video, length_audio) * 100) / 100
102
+ audio = audio[:int(length_data * 1000)]
103
+
104
+ # Extract the video and audio
105
+ start = 0
106
+ end = length_data
107
+ audio.export(out_audio_path, format="wav")
108
+ cmd = "ffmpeg -y -ss %.3f -t %.3f -i %s -i %s -c:v copy -c:a aac -map 0:v:0 -map 1:a:0 -shortest %s -loglevel panic"% (start, end - start, video_path, out_audio_path, out_video_path)
109
+ subprocess.call(cmd, shell=True, stdout=None)
110
+
111
+ # Write the txt file
112
+ start_T, end_T = 0, 0
113
+ start_F, end_F= 0, length_data
114
+ line_new = "FAudio" + ' ' + str(audio_name) + ' ' + str(video_name) + ' ' + str(length_data) \
115
+ + ' ' + str(start_T) + ' ' + str(end_T) + ' ' + str(start_F) + ' ' + str(end_F) + '\n'
116
+ return line_new
117
+
118
+ def generate_TFAudio(line, args):
119
+ # Get the id of the audio and video
120
+ audio_name = line.split()[1][:-4]
121
+ video_name = line.split()[2][:-4]
122
+ id1 = audio_name.split('/')[0]
123
+ name1 = audio_name.split('/')[0] + '_' + audio_name.split('/')[1] + '_' + audio_name.split('/')[2]
124
+ name2 = video_name.split('/')[0] + '_' + video_name.split('/')[1] + '_' + video_name.split('/')[2]
125
+ name = name1 + '_' + name2
126
+ audio_T_path = os.path.join(args.Vox_audio, video_name + '.wav')
127
+ audio_F_path = os.path.join(args.Vox_audio, audio_name + '.wav')
128
+ video_path = os.path.join(args.Vox_video, video_name + '.mp4')
129
+ out_audio_path = os.path.join(args.out_path, 'TFAudio', id1 + '/' + name + '.wav')
130
+ out_video_path = os.path.join(args.out_path, 'TFAudio', id1 + '/' + name + '.mp4')
131
+ os.makedirs(os.path.join(args.out_path, 'TFAudio', id1), exist_ok = True)
132
+
133
+ # Read the audio data and the length of audio and video
134
+ audio_T = AudioSegment.from_file(audio_T_path, format="wav")
135
+ audio_F = AudioSegment.from_file(audio_F_path, format="wav")
136
+ length_audio_T = len(audio_T) / 1000.0
137
+ length_audio_F = len(audio_F) / 1000.0
138
+ length_video = get_length(video_path)
139
+ length_data = int(min(length_audio_T, length_audio_F, length_video) * 100) / 100
140
+ audio_T = audio_T[:int(length_data * 1000)]
141
+ audio_F = audio_F[:int(length_data * 1000)]
142
+
143
+ # Generate the audio
144
+ changepoint = int((length_data * 0.25 + length_data * random.random() * 0.5) * 100) / 100
145
+ audio_dict = {}
146
+ audio_dict['T1'] = audio_T[:changepoint * 1000]
147
+ audio_dict['T2'] = audio_T[changepoint * 1000:]
148
+ audio_dict['F1'] = audio_F[:changepoint * 1000]
149
+ audio_dict['F2'] = audio_F[changepoint * 1000:]
150
+ seed = random.randint(0,1)
151
+ if seed == 1:
152
+ audio = audio_dict['T1'] + audio_dict['F2']
153
+ else:
154
+ audio = audio_dict['F1'] + audio_dict['T2']
155
+ # Extract the video and audio
156
+ start = 0
157
+ end = length_data
158
+ audio.export(out_audio_path, format="wav")
159
+ cmd = "ffmpeg -y -ss %.3f -t %.3f -i %s -i %s -c:v copy -c:a aac -map 0:v:0 -map 1:a:0 -shortest %s -loglevel panic"% (start, end - start, video_path, out_audio_path, out_video_path)
160
+ subprocess.call(cmd, shell=True, stdout=None)
161
+
162
+ # Write the txt file
163
+ if seed == 1:
164
+ start_T, end_T, start_F, end_F = 0, changepoint, changepoint, length_data
165
+ elif seed == 0:
166
+ start_F, end_F, start_T, end_T = 0, changepoint, changepoint, length_data
167
+ line_new = "TFAudio" + ' ' + str(audio_name) + ' ' + str(video_name) + ' ' + str(length_data) \
168
+ + ' ' + str(start_T) + ' ' + str(end_T) + ' ' + str(start_F) + ' ' + str(end_F) + '\n'
169
+ return line_new
170
+
171
def generate_TSilence(line, args):
    """Build one 'TSilence' clip (speech/silence switch over a talking video).

    `line` is one LRS3_ST list entry:
        <type_change> <clip_id> <start*100> <mid*100> <end*100> <suffix>
    where type_change "10" puts the talking span first and "01" puts it after
    the change point.  Times are in hundredths of seconds.
    Writes the spliced wav and the remuxed mp4, and returns the list-file line
    describing the generated clip.
    """
    # Get the id of the audio and video
    type_change = line.split()[0]
    audio_name = line.split()[1]
    video_name = line.split()[1]
    id1 = audio_name.split('/')[0]
    # Output basename: <spk>_<clip>_<suffix> for both the audio and video ids.
    name1 = audio_name.split('/')[0] + '_' + audio_name.split('/')[1] + '_' + line.split()[5]
    name2 = video_name.split('/')[0] + '_' + video_name.split('/')[1] + '_' + line.split()[5]
    name = name1 + '_' + name2
    # Convert the hundredths-of-seconds list fields to seconds.
    start = int(line.split()[2]) / 100
    mid = int(line.split()[3]) / 100
    end = int(line.split()[4]) / 100
    # NOTE(review): audio_name[8:] strips a fixed 8-char prefix from the list
    # entry — confirm this matches the LRS3_ST list path format.
    audio_path = os.path.join(args.lrs3_audio, 'pretrain', audio_name[8:] + '.wav')
    video_path = os.path.join(args.lrs3_video, 'pretrain', video_name[8:]+ '.mp4')
    out_audio_path = os.path.join(args.out_path, 'TSilence', id1 + '/' + name + '.wav')
    out_video_path = os.path.join(args.out_path, 'TSilence', id1 + '/' + name + '.mp4')
    os.makedirs(os.path.join(os.path.join(args.out_path, 'TSilence'), id1), exist_ok = True)

    # Read the audio data and the length of audio and video
    audio = AudioSegment.from_file(audio_path, format="wav")

    # Get the required audio and video data (length truncated to 1/100 s).
    length_data = int((end - start) * 100) / 100
    audio = audio[int(start * 1000):int(end * 1000)]  # pydub slices in milliseconds
    audio.export(out_audio_path, format="wav")
    # Mux the [start, end) slice of the original video with the exported audio.
    cmd = "ffmpeg -y -ss %.3f -t %.3f -i %s -i %s -c:v copy -c:a aac -map 0:v:0 -map 1:a:0 -shortest %s -loglevel panic"% (start, end - start, video_path, out_audio_path, out_video_path)
    subprocess.call(cmd, shell=True, stdout=None)

    # Write the txt file: [start_T, end_T] is the talking span, [start_F, end_F]
    # the non-talking span, both relative to the generated clip.
    changepoint = int((mid - start) * 100) / 100
    if type_change == "10":
        start_T, end_T, start_F, end_F = 0, changepoint, changepoint, length_data
    elif type_change == "01":
        start_T, end_T, start_F, end_F = changepoint, length_data, 0, changepoint
    # NOTE(review): any other type_change value leaves start_T..end_F undefined
    # and raises NameError below — confirm the list only contains "10"/"01".

    # Replace the last 5 characters of the ids with the list suffix.
    audio_name = audio_name[:-5] + line.split()[5]
    video_name = video_name[:-5] + line.split()[5]
    line_new = "TSilence" + ' ' + str(audio_name) + ' ' + str(video_name) + ' ' + str(length_data) \
        + ' ' + str(start_T) + ' ' + str(end_T) + ' ' + str(start_F) + ' ' + str(end_F) + '\n'
    return line_new
210
+
211
def generate_FSilence(line, Flines, args):
    """Build one 'FSilence' clip and return its list-file line.

    `line` is an LRS3_S list entry: <clip_id> <start*100> <end*100> ... <suffix>.
    The clip's own audio (audio_T) and a randomly drawn VoxCeleb utterance
    (audio_F, from `Flines`) are spliced at a random change point; the original
    video track is kept unchanged.
    """
    # Get the id of the audio and video
    audio_T_name = line.split()[0]
    video_name = line.split()[0]
    start = int(line.split()[1]) / 100
    end = int(line.split()[2]) / 100
    length_data = int((end - start) * 100) / 100
    # Change point uniformly inside the middle half [0.25, 0.75] of the clip.
    changepoint = int((length_data * 0.25 + length_data * random.random() * 0.5) * 100) / 100
    # Rejection-sample a VoxCeleb line whose utterance is at least clip-length.
    speech_line = random.choice(Flines)
    length_speech = float(speech_line.split()[-1])
    while length_speech < length_data:
        speech_line = random.choice(Flines)
        length_speech = float(speech_line.split()[-1])
    audio_F_name = speech_line.split()[1][:-4]  # strip the 4-char extension
    id1 = audio_F_name.split('/')[0]
    name1 = audio_F_name.split('/')[0] + '_' + audio_F_name.split('/')[1] + '_' + audio_F_name.split('/')[2]
    name2 = audio_T_name.split('/')[0] + '_' + audio_T_name.split('/')[1] + '_' + line.split()[-1]
    name = name1 + '_' + name2

    # True: orig_video  False: speech+silence (original author's note)
    video_path = os.path.join(args.lrs3_video, 'pretrain', video_name[8:]+ '.mp4')
    audio_T_path = os.path.join(args.lrs3_audio, 'pretrain', audio_T_name[8:] + '.wav')
    audio_F_path = os.path.join(args.Vox_audio, audio_F_name + '.wav')
    out_audio_path = os.path.join(args.out_path, 'FSilence', id1 + '/' + name + '.wav')
    out_video_path = os.path.join(args.out_path, 'FSilence', id1 + '/' + name + '.mp4')
    os.makedirs(os.path.join(args.out_path, 'FSilence', id1), exist_ok = True)

    # Read the audio data and the length of audio and video
    audio_T = AudioSegment.from_file(audio_T_path, format="wav")
    audio_T = audio_T[int(start * 1000):int(end * 1000)]  # pydub slices in ms
    audio_F = AudioSegment.from_file(audio_F_path, format="wav")
    length_audio_T = len(audio_T) / 1000.0
    length_audio_F = len(audio_F) / 1000.0
    # get_length is a sibling helper defined earlier in this file; presumably
    # the video duration in seconds — TODO confirm.
    length_video = get_length(video_path)
    # Final clip length = shortest of the three streams, truncated to 1/100 s.
    # NOTE(review): `changepoint` was computed from the pre-clip length above
    # and can exceed this recomputed length_data, making one spliced half
    # empty — confirm the input lists guarantee this cannot happen.
    length_data = int(min(length_audio_T, length_audio_F, length_video) * 100) / 100
    audio_T = audio_T[:int(length_data * 1000)]
    audio_F = audio_F[:int(length_data * 1000)]

    # Generate the audio: split both streams at the change point (milliseconds).
    audio_dict = {}
    audio_dict['T1'] = audio_T[:changepoint * 1000]
    audio_dict['T2'] = audio_T[changepoint * 1000:]
    audio_dict['F1'] = audio_F[:changepoint * 1000]
    audio_dict['F2'] = audio_F[changepoint * 1000:]
    # Coin flip: own audio first (T1+F2) or foreign speech first (F1+T2).
    seed = random.randint(0,1)
    if seed == 1:
        audio = audio_dict['T1'] + audio_dict['F2']
    else:
        audio = audio_dict['F1'] + audio_dict['T2']
    # Extract the video and audio
    audio.export(out_audio_path, format="wav")
    cmd = "ffmpeg -y -ss %.3f -t %.3f -i %s -i %s -c:v copy -c:a aac -map 0:v:0 -map 1:a:0 -shortest %s -loglevel panic"% (start, end - start, video_path, out_audio_path, out_video_path)
    subprocess.call(cmd, shell=True, stdout=None)

    # Write the txt file: [start_T, end_T] / [start_F, end_F] spans relative to
    # the generated clip (T = own audio, F = foreign speech).
    if seed == 1:
        start_T, end_T, start_F, end_F = 0, changepoint, changepoint, length_data
    elif seed == 0:
        start_F, end_F, start_T, end_T = 0, changepoint, changepoint, length_data

    video_name = video_name[:-5] + line.split()[-1]
    line_new = "FSilence" + ' ' + str(audio_F_name) + ' ' + str(video_name) + ' ' + str(length_data) \
        + ' ' + str(start_T) + ' ' + str(end_T) + ' ' + str(start_F) + ' ' + str(end_F) + '\n'
    return line_new
275
+
276
# MAIN
# Command-line driver: each --task value generates one category of TalkSet
# clips (TAudio / FAudio / TFAudio / TSilence / FSilence) plus its list file;
# 'Fusion' merges the five list files into shuffled train/test splits.
parser = argparse.ArgumentParser(description = "generate_Dataset")

parser.add_argument('--List_folder', type=str, default= 'lists')
parser.add_argument('--out_path', type=str, default= '/data07/ruijie/database/TalkSet_final')
parser.add_argument('--Vox_audio', type=str, default= '/home/ruijie/database/VoxCeleb2/audio/audio_clean/clean/train')
parser.add_argument('--Vox_video', type=str, default= '/home/ruijie/database/VoxCeleb2/video/orig/train')
parser.add_argument('--lrs3_audio', type=str, default='/data07/ruijie/database/LRS3/audio/orig_audio/clean')
parser.add_argument('--lrs3_video', type=str, default='/data07/ruijie/database/LRS3/video/orig_video')
parser.add_argument('--task', type=str, default='TAudio')
parser.add_argument('--num_cpu', type=int, default=10)
args = parser.parse_args()

# One output folder per clip category.
for category in ('TAudio', 'FAudio', 'TFAudio', 'FSilence', 'TSilence'):
    os.makedirs(os.path.join(args.out_path, category), exist_ok = True)

# Input list files and output list locations.
args.list_Vox = os.path.join(args.List_folder, 'lists_in', 'Vox_list.txt')
args.list_LRS3_S = os.path.join(args.List_folder, 'lists_in', 'LRS3_S_list.txt')
args.list_LRS3_ST = os.path.join(args.List_folder, 'lists_in', 'LRS3_ST_list.txt')
args.list_out = os.path.join(args.List_folder, 'lists_out')
args.list_out_train = os.path.join(args.list_out, 'train.txt')
args.list_out_test = os.path.join(args.list_out, 'test.txt')

if args.task == 'TAudio':
    Tlines, _ = read_Vox_lines(args.list_Vox)
    # Generate the clips in parallel; starmap preserves the input order.
    with Pool(args.num_cpu) as p:
        Tlines_new = p.starmap(generate_TAudio, zip(Tlines, repeat(args)))
    # 'with' flushes and closes the list file (the original never closed it).
    with open(os.path.join(args.list_out, 'TAudio.txt'), "w") as out_Tlist_file:
        out_Tlist_file.writelines(Tlines_new)
    print('TAudio Finish')

if args.task == 'FAudio':
    _, Flines = read_Vox_lines(args.list_Vox)
    with Pool(args.num_cpu) as p:
        Flines_new = p.starmap(generate_FAudio, zip(Flines, repeat(args)))
    with open(os.path.join(args.list_out, 'FAudio.txt'), "w") as out_Flist_file:
        out_Flist_file.writelines(Flines_new)
    print('FAudio Finish')

if args.task == 'TFAudio':
    _, Flines = read_Vox_lines(args.list_Vox)
    with Pool(args.num_cpu) as p:
        TFlines_new = p.starmap(generate_TFAudio, zip(Flines, repeat(args)))
    with open(os.path.join(args.list_out, 'TFAudio.txt'), "w") as out_TFlist_file:
        out_TFlist_file.writelines(TFlines_new)
    print('TFAudio Finish')

if args.task == 'TSilence':
    Slines = read_LRS3_ST(args.list_LRS3_ST)
    with Pool(args.num_cpu) as p:
        TSlines_new = p.starmap(generate_TSilence, zip(Slines, repeat(args)))
    with open(os.path.join(args.list_out, 'TSilence.txt'), "w") as out_TSlist_file:
        out_TSlist_file.writelines(TSlines_new)
    print('TSilence Finish')

if args.task == 'FSilence':
    Tlines, _ = read_Vox_lines(args.list_Vox)
    Slines = read_LRS3_S(args.list_LRS3_S)
    with Pool(args.num_cpu) as p:
        FSlines_new = p.starmap(generate_FSilence, zip(Slines, repeat(Tlines), repeat(args)))
    with open(os.path.join(args.list_out, 'FSilence.txt'), "w") as out_FSlist_file:
        out_FSlist_file.writelines(FSlines_new)
    print('FSilence Finish')

if args.task == 'Fusion':
    # Concatenate the five per-category list files (order is irrelevant:
    # everything is shuffled below), then split 90% train / 10% test.
    lines = []
    for name in ('TAudio', 'FAudio', 'TFAudio', 'TSilence', 'FSilence'):
        with open(args.list_out + '/' + name + '.txt') as f:
            lines.extend(f.readlines())
    random.shuffle(lines)
    with open(args.list_out_train, "w") as train_file, open(args.list_out_test, "w") as test_file:
        for num, line in enumerate(lines):
            data = line.split()
            # For the data longer than 6s, we cut them into 6s in the list,
            # so that will make the training process simple.
            if float(data[3]) > 6:
                fields = data[0:3] + [str(min(float(value), 6)) for value in data[3:8]]
            else:
                fields = data[0:8]
            line = ' '.join(fields) + ' ' + "%06d" % int(num) + '\n'
            # 27000 of every 30000 shuffled entries go to train (90/10 split).
            if num % 30000 < 27000:
                train_file.write(line)
            else:
                test_file.write(line)
talknet-asd/awesomeASD.md ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Related Work for Active Speaker Detection
2
+
3
+ ---
4
+ ### Research Paper In **AVA-ActiveSpeaker Dataset**
5
+
6
+ - Roth J, Chaudhuri S, Klejch O, et al. Ava active speaker: [An audio-visual dataset for active speaker detection](https://arxiv.org/pdf/1901.01342.pdf), ICASSP, 2020.
7
+ - Sharma R, Somandepalli K, Narayanan S. [Crossmodal learning for audio-visual speech event localization](https://arxiv.org/pdf/2003.04358.pdf), arXiv preprint, 2020.
8
+ - Alcázar J L, Caba F, Mai L, et al. [Active speakers in context](https://openaccess.thecvf.com/content_CVPR_2020/papers/Alcazar_Active_Speakers_in_Context_CVPR_2020_paper.pdf) , CVPR, 2020.
9
+ - León-Alcázar J, Heilbron F C, Thabet A, et al. [MAAS: Multi-modal Assignation for Active Speaker Detection](https://arxiv.org/pdf/2101.03682.pdf), arXiv preprint, 2021.
10
+ - Huang C, Koishida K. [Improved Active Speaker Detection based on Optical Flow](https://openaccess.thecvf.com/content_CVPRW_2020/papers/w56/Huang_Improved_Active_Speaker_Detection_Based_on_Optical_Flow_CVPRW_2020_paper.pdf), CVPR Workshops, 2020
11
+ - Assunção G, Gonçalves N, Menezes P. [Bio-Inspired Modality Fusion for Active Speaker Detection](https://www.mdpi.com/2076-3417/11/8/3397/pdf), Applied Sciences, 2021
12
+ - Pouthier B, Pilati L, Gudupudi L K, et al. [Active Speaker Detection as a Multi-Objective Optimization with Uncertainty-based Multimodal Fusion](https://arxiv.org/pdf/2106.03821.pdf), arXiv preprint, 2021
13
+ - Köpüklü O, Taseska M, Rigoll G. [How to Design a Three-Stage Architecture for Audio-Visual Active Speaker Detection in the Wild](https://arxiv.org/pdf/2106.03932.pdf), arXiv preprint, 2021
14
+ - Ruijie Tao, Zexu Pan, Rohan Kumar Das, Xinyuan Qian, Mike Zheng Shou, Haizhou Li. [Is Someone Speaking? Exploring Long-term Temporal Features for Audio-visual Active Speaker Detection](https://arxiv.org/pdf/2107.06592.pdf), ACM Multimedia (MM), 2021
15
+ - Yuanhang Zhang, Susan Liang, Shuang Yang, Xiao Liu, Zhongqin Wu, Shiguang Shan, Xilin Chen. [UniCon: Unified Context Network for Robust Active Speaker
16
+ Detection](https://arxiv.org/pdf/2108.02607.pdf), ACM Multimedia (MM), 2021
17
+
18
+
19
+ ### Research Report In **AVA-ActiveSpeaker Dataset for AVA-Activity Challenge**
20
+ - Chung J S. [Naver at ActivityNet Challenge 2019--Task B Active Speaker Detection (AVA)](https://arxiv.org/pdf/1906.10555.pdf), 2019.
21
+ - Zhang Y H, Xiao J, Yang S, et al. [Multi-Task Learning for Audio-Visual Active Speaker Detection](https://static.googleusercontent.com/media/research.google.com/zh-CN//ava/2019/Multi_Task_Learning_for_Audio_Visual_Active_Speaker_Detection.pdf), 2019
22
+ - Alcázar J L, Caba F, Mai L, et al. [Universidad de los Andes at ActivityNet Challenge 2020 - Task B Active Speaker
23
+ Detection (AVA)](https://static.googleusercontent.com/media/research.google.com/zh-CN//ava/2020/ASC_AN_report.pdf), 2020
24
+ - Köpüklü O, Taseska M, Rigoll G. [ASDNet at ActivityNet Challenge 2021-Active Speaker Detection (AVA)](https://static.googleusercontent.com/media/research.google.com/zh-CN//ava/2021/S2_ActivityNet_Report_ASDNet.pdf), 2021
25
+ - Zhang Y, Liang S, Yang S, et al. [ICTCAS-UCAS-TAL Submission to the AVA-ActiveSpeaker Task at ActivityNet Challenge 2021](http://static.googleusercontent.com/media/research.google.com/zh-CN//ava/2021/S1_ICTCAS-UCAS-TAL.pdf), 2021
26
+ - Tao R, Pan Z, Das R K, et al. [NUS-HLT Report for ActivityNet Challenge 2021 AVA (Speaker)](https://static.googleusercontent.com/media/research.google.com/zh-CN//ava/2021/S3_NUS_Report_AVA_ActiveSpeaker_2021.pdf), 2021
27
+
28
+ ### Research Paper In **Columbia Active Speaker Detection Dataset**
29
+ - Chakravarty P, Tuytelaars T. [Cross-modal supervision for learning active speaker detection in video](https://arxiv.org/pdf/1603.08907.pdf), ECCV, 2016
30
+ - Chung J S, Zisserman A. [Out of time: automated lip sync in the wild](https://www.robots.ox.ac.uk/~vgg/publications/2016/Chung16a/chung16a.pdf), ECCV, 2016
31
+ - Shahid M, Beyan C, Murino V. [Voice activity detection by upper body motion analysis and unsupervised domain adaptation](https://openaccess.thecvf.com/content_ICCVW_2019/papers/HBU/Shahid_Voice_Activity_Detection_by_Upper_Body_Motion_Analysis_and_Unsupervised_ICCVW_2019_paper.pdf), ICCV Workshops, 2019
32
+ - Afouras T, Owens A, Chung J S, et al. [Self-supervised learning of audio-visual objects from video](https://arxiv.org/pdf/2008.04237.pdf), ECCV, 2020
33
+ - Shahid M, Beyan C, Murino V. [Comparisons of visual activity primitives for voice activity detection](https://www.researchgate.net/profile/Cigdem-Beyan/publication/335604556_Comparisons_of_Visual_Activity_Primitives_for_Voice_Activity_Detection/links/5fa19074a6fdccfd7b97c0f5/Comparisons-of-Visual-Activity-Primitives-for-Voice-Activity-Detection.pdf), ICIAP, 2019
34
+ - Shahid M, Beyan C, Murino V. [S-VVAD: Visual Voice Activity Detection by Motion](https://www.researchgate.net/profile/Cigdem-Beyan/publication/348279893_S-VVAD_Visual_Voice_Activity_Detection_by_Motion_Segmentation/links/5ff60482299bf14088786cc1/S-VVAD-Visual-Voice-Activity-Detection-by-Motion-Segmentation.pdf), WACV, 2021
35
+ - Beyan C, Shahid M, Murino V. [RealVAD: A real-world dataset and a method for voice activity detection by body motion analysis](https://ieeexplore.ieee.org/document/9133504), IEEE Transactions on Multimedia, 2020.
36
+
37
+ ### Other Paper for Active Speaker Detection
38
+ - Kim You Jin and Heo Hee-Soo, Soyeon Choe, et al. [Look Who’s Talking: Active Speaker Detection in the Wild](https://arxiv.org/pdf/2108.07640.pdf), Interspeech, 2021
talknet-asd/cog.yaml ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Configuration for Cog ⚙️
2
+ # Reference: https://github.com/replicate/cog/blob/main/docs/yaml.md
3
+
4
+ build:
5
+ # set to true if your model requires a GPU
6
+ gpu: true
7
+
8
+ # a list of ubuntu apt packages to install
9
+ system_packages:
10
+ - "libgl1-mesa-glx"
11
+ - "ffmpeg"
12
+ # - "libglib2.0-0"
13
+
14
+ # python version in the form '3.11' or '3.11.4'
15
+ python_version: "3.8"
16
+
17
+ # a list of packages in the format <package-name>==<version>
18
+ python_packages:
19
+ - "torch>=1.6.0"
20
+ - "torchaudio>=0.6.0"
21
+ - "numpy"
22
+ - "scipy"
23
+ - "scikit-learn"
24
+ - "tqdm"
25
+ - "scenedetect"
26
+ - "opencv-python"
27
+ - "python_speech_features"
28
+ - "torchvision"
29
+ - "ffmpeg"
30
+ - "gdown"
31
+ - "youtube-dl"
32
+ - "pandas"
33
+
34
+ # commands run after the environment is setup
35
+ # run:
36
+ # - "echo env is ready!"
37
+ # - "echo another command if needed"
38
+
39
+ # predict.py defines how predictions are run on your model
40
+ predict: "predict.py:Predictor"
talknet-asd/dataLoader.py ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, torch, numpy, cv2, random, glob, python_speech_features
2
+ from scipy.io import wavfile
3
+ from torchvision.transforms import RandomCrop
4
+
5
def generate_audio_set(dataPath, batchList):
    """Read the wav file of every batch entry into a dict keyed by entry name.

    Each entry is a tab-separated list line whose first field is the entry
    name; the first 11 characters of that name identify the parent video
    folder under dataPath.
    """
    waveforms = {}
    for entry in batchList:
        entryName = entry.split('\t')[0]
        wavPath = os.path.join(dataPath, entryName[:11], entryName + '.wav')
        _, samples = wavfile.read(wavPath)
        waveforms[entryName] = samples
    return waveforms
14
+
15
def overlap(dataName, audio, audioSet):
    """Augment `audio` by mixing in one other utterance from `audioSet`
    at a random SNR drawn uniformly from [-5, 5] dB.

    `audio` is an int16 waveform; the return value is int16 of the same
    length.  The noise utterance is wrap-padded or truncated to match.
    """
    # sorted() gives a deterministic candidate order; the original
    # random.sample() on a set raises TypeError on Python >= 3.11 and was
    # hash-order dependent before that.
    noiseName = random.choice(sorted(set(audioSet.keys()) - {dataName}))
    noiseAudio = audioSet[noiseName]
    snr = random.uniform(-5, 5)
    if len(noiseAudio) < len(audio):
        shortage = len(audio) - len(noiseAudio)
        noiseAudio = numpy.pad(noiseAudio, (0, shortage), 'wrap')
    else:
        noiseAudio = noiseAudio[:len(audio)]
    # Cast to float64 before squaring: the int16 input would otherwise wrap
    # around on x**2 and corrupt the dB estimate.
    noiseDB = 10 * numpy.log10(numpy.mean(noiseAudio.astype(numpy.float64) ** 2) + 1e-4)
    cleanDB = 10 * numpy.log10(numpy.mean(audio.astype(numpy.float64) ** 2) + 1e-4)
    # Scale the noise so that cleanDB - noiseDB equals the target SNR.
    noiseAudio = numpy.sqrt(10 ** ((cleanDB - noiseDB - snr) / 10)) * noiseAudio
    audio = audio + noiseAudio
    return audio.astype(numpy.int16)
29
+
30
def load_audio(data, dataPath, numFrames, audioAug, audioSet = None):
    """Return the MFCC feature matrix for one entry, truncated/padded to
    exactly numFrames * 4 audio frames (13 cepstral coefficients each).

    With audioAug enabled, roughly half of the calls mix in another
    utterance from audioSet via overlap().
    """
    dataName = data[0]
    fps = float(data[2])
    audio = audioSet[dataName]
    # Randomly apply noise augmentation on ~50% of the training samples.
    if audioAug and random.randint(0, 1) == 1:
        audio = overlap(dataName, audio, audioSet)
    # fps is not always 25; scale the MFCC window and step so the audio
    # frames stay aligned with the visual frames (4 audio frames per video
    # frame at 25 fps).
    winLen = 0.025 * 25 / fps
    winStep = 0.010 * 25 / fps
    audio = python_speech_features.mfcc(audio, 16000, numcep = 13, winlen = winLen, winstep = winStep)
    targetLen = int(numFrames * 4)
    if audio.shape[0] < targetLen:
        # Wrap-pad short clips up to the target length.
        audio = numpy.pad(audio, ((0, targetLen - audio.shape[0]), (0, 0)), 'wrap')
    return audio[:int(round(numFrames * 4)), :]
48
+
49
def load_visual(data, dataPath, numFrames, visualAug):
    """Load up to numFrames 112x112 grayscale face crops for one entry.

    With visualAug, one augmentation (none / horizontal flip / random crop /
    random rotation) is chosen per clip and applied to every frame.
    Returns a numpy array of shape (frames, 112, 112).
    """
    dataName = data[0]
    videoName = data[0][:11]
    faceFolderPath = os.path.join(dataPath, videoName, dataName)
    faceFiles = glob.glob("%s/*.jpg"%faceFolderPath)
    # Frame files are named numerically; sort by that number, ascending.
    sortedFaceFiles = sorted(faceFiles, key=lambda data: (float(data.split('/')[-1][:-4])), reverse=False)
    faces = []
    H = 112
    if visualAug == True:
        # Crop size in [0.7*H, H); clamp to H-1 so numpy.random.randint(0,
        # H - new) always has a non-empty range — random.uniform(0.7, 1) can
        # return exactly 1.0, and new == H made randint raise ValueError.
        new = min(int(H * random.uniform(0.7, 1)), H - 1)
        x, y = numpy.random.randint(0, H - new), numpy.random.randint(0, H - new)
        M = cv2.getRotationMatrix2D((H/2, H/2), random.uniform(-15, 15), 1)
        augType = random.choice(['orig', 'flip', 'crop', 'rotate'])
    else:
        augType = 'orig'
    for faceFile in sortedFaceFiles[:numFrames]:
        face = cv2.imread(faceFile)
        face = cv2.cvtColor(face, cv2.COLOR_BGR2GRAY)
        face = cv2.resize(face, (H, H))
        if augType == 'orig':
            faces.append(face)
        elif augType == 'flip':
            faces.append(cv2.flip(face, 1))
        elif augType == 'crop':
            faces.append(cv2.resize(face[y:y+new, x:x+new], (H, H)))
        elif augType == 'rotate':
            faces.append(cv2.warpAffine(face, M, (H, H)))
    return numpy.array(faces)
78
+
79
+
80
def load_label(data, numFrames):
    """Parse the per-frame label field (e.g. "[1,0,1]") of a list entry into
    a numpy int array truncated to at most numFrames entries."""
    raw = data[3].replace('[', '').replace(']', '')
    parsed = [int(token) for token in raw.split(',')]
    return numpy.array(parsed[:numFrames])
88
+
89
class train_loader(object):
    """Dataset-like loader that yields whole training mini-batches per index.

    Entries of (roughly) equal frame length are packed into the same batch so
    the whole batch can be stacked into dense tensors without per-sample
    padding; every entry in a batch is truncated to the batch's shortest clip.
    """

    def __init__(self, trialFileName, audioPath, visualPath, batchSize, **kwargs):
        # trialFileName: list file with tab-separated fields; field 1 is the
        # clip length in frames (see load_audio/load_visual/load_label).
        self.audioPath = audioPath
        self.visualPath = visualPath
        self.miniBatch = []
        # NOTE(review): file handle is never closed explicitly (relies on GC).
        mixLst = open(trialFileName).read().splitlines()
        # sort the training set by the length of the videos, shuffle them to make more videos in the same batch belong to different movies
        sortedMixLst = sorted(mixLst, key=lambda data: (int(data.split('\t')[1]), int(data.split('\t')[-1])), reverse=True)
        # Pack entries into batches: longer clips get fewer entries so the
        # total frame count per batch stays near batchSize.
        start = 0
        while True:
            length = int(sortedMixLst[start].split('\t')[1])
            end = min(len(sortedMixLst), start + max(int(batchSize / length), 1))
            self.miniBatch.append(sortedMixLst[start:end])
            if end == len(sortedMixLst):
                break
            start = end
        # NOTE(review): an empty trial file raises IndexError on the first
        # sortedMixLst[start] — confirm callers always pass a non-empty list.

    def __getitem__(self, index):
        batchList = self.miniBatch[index]
        # The list is sorted descending by length, so the last entry is the
        # shortest; everything is truncated to its frame count.
        numFrames = int(batchList[-1].split('\t')[1])
        audioFeatures, visualFeatures, labels = [], [], []
        audioSet = generate_audio_set(self.audioPath, batchList) # load the audios in this batch to do augmentation
        for line in batchList:
            data = line.split('\t')
            audioFeatures.append(load_audio(data, self.audioPath, numFrames, audioAug = True, audioSet = audioSet))
            visualFeatures.append(load_visual(data, self.visualPath,numFrames, visualAug = True))
            labels.append(load_label(data, numFrames))
        # Shapes: audio (B, numFrames*4, 13); visual (B, numFrames, 112, 112);
        # labels (B, numFrames).
        return torch.FloatTensor(numpy.array(audioFeatures)), \
               torch.FloatTensor(numpy.array(visualFeatures)), \
               torch.LongTensor(numpy.array(labels))

    def __len__(self):
        return len(self.miniBatch)
122
+
123
+
124
class val_loader(object):
    """Dataset-like loader for validation: one entry per item, no
    augmentation, batch dimension of 1.

    Returns the same (audio, visual, label) tensor triple as train_loader.
    """

    def __init__(self, trialFileName, audioPath, visualPath, **kwargs):
        self.audioPath = audioPath
        self.visualPath = visualPath
        # Context manager closes the file deterministically (the original
        # left the handle open until garbage collection).
        with open(trialFileName) as trialFile:
            self.miniBatch = trialFile.read().splitlines()

    def __getitem__(self, index):
        line = [self.miniBatch[index]]
        numFrames = int(line[0].split('\t')[1])
        audioSet = generate_audio_set(self.audioPath, line)
        data = line[0].split('\t')
        audioFeatures = [load_audio(data, self.audioPath, numFrames, audioAug = False, audioSet = audioSet)]
        visualFeatures = [load_visual(data, self.visualPath, numFrames, visualAug = False)]
        labels = [load_label(data, numFrames)]
        return torch.FloatTensor(numpy.array(audioFeatures)), \
               torch.FloatTensor(numpy.array(visualFeatures)), \
               torch.LongTensor(numpy.array(labels))

    def __len__(self):
        return len(self.miniBatch)
talknet-asd/demoTalkNet.py ADDED
@@ -0,0 +1,686 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys, time, os, tqdm, torch, argparse, glob, subprocess, warnings, cv2, pickle, numpy, pdb, math, python_speech_features
2
+
3
+ from scipy import signal
4
+ from shutil import rmtree
5
+ from scipy.io import wavfile
6
+ from scipy.interpolate import interp1d
7
+ from sklearn.metrics import accuracy_score, f1_score
8
+
9
+ from scenedetect.video_manager import VideoManager
10
+ from scenedetect.scene_manager import SceneManager
11
+ from scenedetect.frame_timecode import FrameTimecode
12
+ from scenedetect.stats_manager import StatsManager
13
+ from scenedetect.detectors import ContentDetector
14
+
15
+ from model.faceDetector.s3fd import S3FD
16
+ from talkNet import talkNet
17
+
18
warnings.filterwarnings("ignore")

# ---- Command-line interface -------------------------------------------------
parser = argparse.ArgumentParser(description="TalkNet Demo or Columnbia ASD Evaluation")

parser.add_argument("--videoName", type=str, default="001", help="Demo video name")
parser.add_argument(
    "--videoFolder", type=str, default="demo", help="Path for inputs, tmps and outputs"
)
parser.add_argument(
    "--pretrainModel",
    type=str,
    default="pretrain_TalkSet.model",
    help="Path for the pretrained TalkNet model",
)

parser.add_argument(
    "--nDataLoaderThread", type=int, default=10, help="Number of workers"
)
parser.add_argument(
    "--facedetScale",
    type=float,
    default=0.25,
    help="Scale factor for face detection, the frames will be scale to 0.25 orig",
)
parser.add_argument(
    "--minTrack", type=int, default=10, help="Number of min frames for each shot"
)
parser.add_argument(
    "--numFailedDet",
    type=int,
    default=10,
    help="Number of missed detections allowed before tracking is stopped",
)
parser.add_argument(
    "--minFaceSize", type=int, default=1, help="Minimum face size in pixels"
)
parser.add_argument("--cropScale", type=float, default=0.40, help="Scale bounding box")

parser.add_argument("--start", type=int, default=0, help="The start time of the video")
parser.add_argument(
    "--duration",
    type=int,
    default=0,
    help="The duration of the video, when set as 0, will extract the whole video",
)

parser.add_argument(
    "--evalCol",
    dest="evalCol",
    action="store_true",
    help="Evaluate on Columnbia dataset",
)
parser.add_argument(
    "--colSavePath",
    type=str,
    default="/data08/col",
    help="Path for inputs, tmps and outputs",
)

args = parser.parse_args()

# Fetch the pretrained TalkNet weights from Google Drive on first run.
if os.path.isfile(args.pretrainModel) == False:  # Download the pretrained model
    Link = "1AbN9fCf9IexMxEKXLQY2KYBlb-IhSEea"
    cmd = "gdown --id %s -O %s" % (Link, args.pretrainModel)
    subprocess.call(cmd, shell=True, stdout=None)

if args.evalCol == True:
    # Columbia-evaluation mode.  The process is:
    #   1. download video and labels (label format modified to make it easier to use)
    #   2. extract audio, extract video frames
    #   3. scene detection, face detection and face tracking
    #   4. active speaker detection for the detected face clips
    #   5. use IOU to find the identity of each face clip, compute the F1 results
    # Steps 1-3 are a one-time process; speed depends on CPU/GPU (reference: ~1.5 h).
    # Steps 4-5 need less than 10 minutes.  Needs about 20 GB of space in total.
    args.videoName = "col"
    args.videoFolder = args.colSavePath
    args.savePath = os.path.join(args.videoFolder, args.videoName)
    args.videoPath = os.path.join(args.videoFolder, args.videoName + ".mp4")
    args.duration = 0
    if os.path.isfile(args.videoPath) == False:  # Download video
        link = "https://www.youtube.com/watch?v=6GzxbrO0DHM&t=2s"
        cmd = "youtube-dl -f best -o %s '%s'" % (args.videoPath, link)
        output = subprocess.call(cmd, shell=True, stdout=None)
    if os.path.isdir(args.videoFolder + "/col_labels") == False:  # Download label
        link = "1Tto5JBt6NsEOLFRWzyZEeV6kCCddc6wv"
        cmd = "gdown --id %s -O %s" % (link, args.videoFolder + "/col_labels.tar.gz")
        subprocess.call(cmd, shell=True, stdout=None)
        cmd = "tar -xzvf %s -C %s" % (
            args.videoFolder + "/col_labels.tar.gz",
            args.videoFolder,
        )
        subprocess.call(cmd, shell=True, stdout=None)
        os.remove(args.videoFolder + "/col_labels.tar.gz")
else:
    # Demo mode: locate the input video by name inside --videoFolder.
    args.videoPath = glob.glob(os.path.join(args.videoFolder, args.videoName + ".*"))[0]
    args.savePath = os.path.join(args.videoFolder, args.videoName)
117
+
118
+ def scene_detect(args):
119
+ # CPU: Scene detection, output is the list of each shot's time duration
120
+ videoManager = VideoManager([args.videoFilePath])
121
+ statsManager = StatsManager()
122
+ sceneManager = SceneManager(statsManager)
123
+ sceneManager.add_detector(ContentDetector())
124
+ baseTimecode = videoManager.get_base_timecode()
125
+ videoManager.set_downscale_factor()
126
+ videoManager.start()
127
+ sceneManager.detect_scenes(frame_source=videoManager)
128
+ sceneList = sceneManager.get_scene_list(baseTimecode)
129
+ savePath = os.path.join(args.pyworkPath, "scene.pckl")
130
+ if sceneList == []:
131
+ sceneList = [
132
+ (videoManager.get_base_timecode(), videoManager.get_current_timecode())
133
+ ]
134
+ with open(savePath, "wb") as fil:
135
+ pickle.dump(sceneList, fil)
136
+ sys.stderr.write(
137
+ "%s - scenes detected %d\n" % (args.videoFilePath, len(sceneList))
138
+ )
139
+ return sceneList
140
+
141
+
142
+ def inference_video(args):
143
+ # GPU: Face detection, output is the list contains the face location and score in this frame
144
+ DET = S3FD(device="cuda")
145
+ flist = glob.glob(os.path.join(args.pyframesPath, "*.jpg"))
146
+ flist.sort()
147
+ dets = []
148
+ for fidx, fname in enumerate(flist):
149
+ image = cv2.imread(fname)
150
+ imageNumpy = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
151
+ bboxes = DET.detect_faces(imageNumpy, conf_th=0.9, scales=[args.facedetScale])
152
+ dets.append([])
153
+ for bbox in bboxes:
154
+ dets[-1].append(
155
+ {"frame": fidx, "bbox": (bbox[:-1]).tolist(), "conf": bbox[-1]}
156
+ ) # dets has the frames info, bbox info, conf info
157
+ sys.stderr.write(
158
+ "%s-%05d; %d dets\r" % (args.videoFilePath, fidx, len(dets[-1]))
159
+ )
160
+ savePath = os.path.join(args.pyworkPath, "faces.pckl")
161
+ with open(savePath, "wb") as fil:
162
+ pickle.dump(dets, fil)
163
+ return dets
164
+
165
+
166
+ def bb_intersection_over_union(boxA, boxB, evalCol=False):
167
+ # CPU: IOU Function to calculate overlap between two image
168
+ xA = max(boxA[0], boxB[0])
169
+ yA = max(boxA[1], boxB[1])
170
+ xB = min(boxA[2], boxB[2])
171
+ yB = min(boxA[3], boxB[3])
172
+ interArea = max(0, xB - xA) * max(0, yB - yA)
173
+ boxAArea = (boxA[2] - boxA[0]) * (boxA[3] - boxA[1])
174
+ boxBArea = (boxB[2] - boxB[0]) * (boxB[3] - boxB[1])
175
+ if evalCol == True:
176
+ iou = interArea / float(boxAArea)
177
+ else:
178
+ iou = interArea / float(boxAArea + boxBArea - interArea)
179
+ return iou
180
+
181
+
182
+ def track_shot(args, sceneFaces):
183
+ # CPU: Face tracking
184
+ iouThres = 0.5 # Minimum IOU between consecutive face detections
185
+ tracks = []
186
+ while True:
187
+ track = []
188
+ for frameFaces in sceneFaces:
189
+ for face in frameFaces:
190
+ if track == []:
191
+ track.append(face)
192
+ frameFaces.remove(face)
193
+ elif face["frame"] - track[-1]["frame"] <= args.numFailedDet:
194
+ iou = bb_intersection_over_union(face["bbox"], track[-1]["bbox"])
195
+ if iou > iouThres:
196
+ track.append(face)
197
+ frameFaces.remove(face)
198
+ continue
199
+ else:
200
+ break
201
+ if track == []:
202
+ break
203
+ elif len(track) > args.minTrack:
204
+ frameNum = numpy.array([f["frame"] for f in track])
205
+ bboxes = numpy.array([numpy.array(f["bbox"]) for f in track])
206
+ frameI = numpy.arange(frameNum[0], frameNum[-1] + 1)
207
+ bboxesI = []
208
+ for ij in range(0, 4):
209
+ interpfn = interp1d(frameNum, bboxes[:, ij])
210
+ bboxesI.append(interpfn(frameI))
211
+ bboxesI = numpy.stack(bboxesI, axis=1)
212
+ if (
213
+ max(
214
+ numpy.mean(bboxesI[:, 2] - bboxesI[:, 0]),
215
+ numpy.mean(bboxesI[:, 3] - bboxesI[:, 1]),
216
+ )
217
+ > args.minFaceSize
218
+ ):
219
+ tracks.append({"frame": frameI, "bbox": bboxesI})
220
+ return tracks
221
+
222
+
223
def crop_video(args, track, cropFile):
    """CPU: crop one face track into a 224x224 clip with aligned audio.

    Writes ``<cropFile>t.avi`` (silent face crops at 25 fps), extracts the
    matching audio span to ``<cropFile>.wav`` via ffmpeg, muxes both into
    ``<cropFile>.avi`` and removes the temporary silent video.

    Returns:
        dict: the original ``track`` plus ``proc_track`` — the median-smoothed
        per-frame crop center (x, y) and half box size s.
    """
    flist = glob.glob(os.path.join(args.pyframesPath, "*.jpg"))  # Read the frames
    flist.sort()
    vOut = cv2.VideoWriter(
        cropFile + "t.avi", cv2.VideoWriter_fourcc(*"XVID"), 25, (224, 224)
    )  # Write video
    dets = {"x": [], "y": [], "s": []}
    for det in track["bbox"]:  # Read the tracks
        dets["s"].append(max((det[3] - det[1]), (det[2] - det[0])) / 2)  # half box size
        dets["y"].append((det[1] + det[3]) / 2)  # crop center y
        dets["x"].append((det[0] + det[2]) / 2)  # crop center x
    dets["s"] = signal.medfilt(dets["s"], kernel_size=13)  # Smooth detections
    dets["x"] = signal.medfilt(dets["x"], kernel_size=13)
    dets["y"] = signal.medfilt(dets["y"], kernel_size=13)
    for fidx, frame in enumerate(track["frame"]):
        cs = args.cropScale
        bs = dets["s"][fidx]  # Detection box size
        bsi = int(bs * (1 + 2 * cs))  # Pad videos by this amount
        image = cv2.imread(flist[frame])
        # Pad with gray (110) so crops near the image border stay in bounds.
        frame = numpy.pad(
            image,
            ((bsi, bsi), (bsi, bsi), (0, 0)),
            "constant",
            constant_values=(110, 110),
        )
        my = dets["y"][fidx] + bsi  # BBox center Y (in padded coordinates)
        mx = dets["x"][fidx] + bsi  # BBox center X (in padded coordinates)
        face = frame[
            int(my - bs) : int(my + bs * (1 + 2 * cs)),
            int(mx - bs * (1 + cs)) : int(mx + bs * (1 + cs)),
        ]
        vOut.write(cv2.resize(face, (224, 224)))
    audioTmp = cropFile + ".wav"
    # Timestamps assume the fixed 25 fps frame-extraction rate.
    audioStart = (track["frame"][0]) / 25
    audioEnd = (track["frame"][-1] + 1) / 25
    vOut.release()
    command = (
        "ffmpeg -y -i %s -async 1 -ac 1 -vn -acodec pcm_s16le -ar 16000 -threads %d -ss %.3f -to %.3f %s -loglevel panic"
        % (args.audioFilePath, args.nDataLoaderThread, audioStart, audioEnd, audioTmp)
    )
    output = subprocess.call(command, shell=True, stdout=None)  # Crop audio file
    _, audio = wavfile.read(audioTmp)
    command = (
        "ffmpeg -y -i %st.avi -i %s -threads %d -c:v copy -c:a copy %s.avi -loglevel panic"
        % (cropFile, audioTmp, args.nDataLoaderThread, cropFile)
    )  # Combine audio and video file
    output = subprocess.call(command, shell=True, stdout=None)
    os.remove(cropFile + "t.avi")
    return {"track": track, "proc_track": dets}
273
+
274
+
275
def extract_MFCC(file, outPath):
    """CPU: extract 13-dim MFCC features from a wav file and save as .npy.

    The output keeps the input's base name with a ``.npy`` suffix and is
    written into ``outPath``.  [1s of 16 kHz audio = 100 MFCC frames]
    """
    sr, audio = wavfile.read(file)
    mfcc = python_speech_features.mfcc(audio, sr)  # (N_frames, 13)
    # os.path.basename instead of split("/") so Windows paths also work.
    baseName = os.path.basename(file).replace(".wav", ".npy")
    featuresPath = os.path.join(outPath, baseName)
    numpy.save(featuresPath, mfcc)
281
+
282
+
283
def evaluate_network(files, args):
    """GPU: active speaker detection on each cropped clip with pretrained TalkNet.

    For every ``*.avi`` in ``files``, the matching ``.wav`` is loaded, MFCC
    features (100 fps) and grayscale 112x112 center crops (25 fps) are built,
    and TalkNet is scored over several window durations; the per-duration
    score tracks are averaged for robustness.

    Returns:
        list[numpy.ndarray]: one array of frame-level speaking scores per clip.
    """
    s = talkNet()
    s.loadParameters(args.pretrainModel)
    sys.stderr.write("Model %s loaded from previous state! \r\n" % args.pretrainModel)
    s.eval()
    allScores = []
    # durationSet = {1,2,4,6} # To make the result more reliable
    # NOTE(review): duplicates collapse in a set literal, so this equals
    # {1, 2, 3, 4, 5, 6}; the repeated values add no extra weighting.
    durationSet = {
        1,
        1,
        1,
        2,
        2,
        2,
        3,
        3,
        4,
        5,
        6,
    }  # Use this line can get more reliable result
    for file in tqdm.tqdm(files, total=len(files)):
        fileName = os.path.splitext(file.split("/")[-1])[0]  # Load audio and video
        _, audio = wavfile.read(os.path.join(args.pycropPath, fileName + ".wav"))
        audioFeature = python_speech_features.mfcc(
            audio, 16000, numcep=13, winlen=0.025, winstep=0.010
        )
        video = cv2.VideoCapture(os.path.join(args.pycropPath, fileName + ".avi"))
        videoFeature = []
        while video.isOpened():
            ret, frames = video.read()
            if ret == True:
                face = cv2.cvtColor(frames, cv2.COLOR_BGR2GRAY)
                face = cv2.resize(face, (224, 224))
                # Keep the central 112x112 patch of the 224x224 crop.
                face = face[
                    int(112 - (112 / 2)) : int(112 + (112 / 2)),
                    int(112 - (112 / 2)) : int(112 + (112 / 2)),
                ]
                videoFeature.append(face)
            else:
                break
        video.release()
        videoFeature = numpy.array(videoFeature)
        # Clip both streams to a common duration (audio 100 fps, video 25 fps).
        length = min(
            (audioFeature.shape[0] - audioFeature.shape[0] % 4) / 100,
            videoFeature.shape[0] / 25,
        )
        audioFeature = audioFeature[: int(round(length * 100)), :]
        videoFeature = videoFeature[: int(round(length * 25)), :, :]
        allScore = []  # Evaluation use TalkNet
        for duration in durationSet:
            batchSize = int(math.ceil(length / duration))
            scores = []
            with torch.no_grad():
                for i in range(batchSize):
                    inputA = (
                        torch.FloatTensor(
                            audioFeature[
                                i * duration * 100 : (i + 1) * duration * 100, :
                            ]
                        )
                        .unsqueeze(0)
                        .cuda()
                    )
                    inputV = (
                        torch.FloatTensor(
                            videoFeature[
                                i * duration * 25 : (i + 1) * duration * 25, :, :
                            ]
                        )
                        .unsqueeze(0)
                        .cuda()
                    )
                    embedA = s.model.forward_audio_frontend(inputA)
                    embedV = s.model.forward_visual_frontend(inputV)
                    embedA, embedV = s.model.forward_cross_attention(embedA, embedV)
                    out = s.model.forward_audio_visual_backend(embedA, embedV)
                    score = s.lossAV.forward(out, labels=None)
                    scores.extend(score)
            allScore.append(scores)
        # Average the per-duration score tracks into one estimate per frame.
        allScore = numpy.round((numpy.mean(numpy.array(allScore), axis=0)), 1).astype(
            float
        )
        allScores.append(allScore)
    return allScores
368
+
369
+
370
def visualization(tracks, scores, args):
    """CPU: render the ASD result onto the frames and mux with audio.

    Draws a box per tracked face — green when the smoothed score is >= 0
    (speaking), red otherwise — writes ``video_only.avi`` and then combines
    it with the extracted audio into ``video_out.avi`` via ffmpeg.
    """
    flist = glob.glob(os.path.join(args.pyframesPath, "*.jpg"))
    flist.sort()
    faces = [[] for i in range(len(flist))]
    for tidx, track in enumerate(tracks):
        score = scores[tidx]
        for fidx, frame in enumerate(track["track"]["frame"].tolist()):
            # NOTE(review): the slice end is len(score) - 1, so the very last
            # score never enters the smoothing window — kept as upstream.
            s = score[
                max(fidx - 2, 0) : min(fidx + 3, len(score) - 1)
            ]  # average smoothing
            s = numpy.mean(s)
            faces[frame].append(
                {
                    "track": tidx,
                    "score": float(s),
                    "s": track["proc_track"]["s"][fidx],
                    "x": track["proc_track"]["x"][fidx],
                    "y": track["proc_track"]["y"][fidx],
                }
            )
    firstImage = cv2.imread(flist[0])
    fw = firstImage.shape[1]
    fh = firstImage.shape[0]
    vOut = cv2.VideoWriter(
        os.path.join(args.pyaviPath, "video_only.avi"),
        cv2.VideoWriter_fourcc(*"XVID"),
        25,
        (fw, fh),
    )
    # Negative score -> 0 (red in BGR), non-negative -> 255 (green in BGR).
    colorDict = {0: 0, 1: 255}
    for fidx, fname in tqdm.tqdm(enumerate(flist), total=len(flist)):
        image = cv2.imread(fname)
        for face in faces[fidx]:
            clr = colorDict[int((face["score"] >= 0))]
            txt = round(face["score"], 1)
            cv2.rectangle(
                image,
                (int(face["x"] - face["s"]), int(face["y"] - face["s"])),
                (int(face["x"] + face["s"]), int(face["y"] + face["s"])),
                (0, clr, 255 - clr),
                10,
            )
            cv2.putText(
                image,
                "%s" % (txt),
                (int(face["x"] - face["s"]), int(face["y"] - face["s"])),
                cv2.FONT_HERSHEY_SIMPLEX,
                1.5,
                (0, clr, 255 - clr),
                5,
            )
        vOut.write(image)
    vOut.release()
    command = (
        "ffmpeg -y -i %s -i %s -threads %d -c:v copy -c:a copy %s -loglevel panic"
        % (
            os.path.join(args.pyaviPath, "video_only.avi"),
            os.path.join(args.pyaviPath, "audio.wav"),
            args.nDataLoaderThread,
            os.path.join(args.pyaviPath, "video_out.avi"),
        )
    )
    output = subprocess.call(command, shell=True, stdout=None)
434
+
435
+
436
def evaluate_col_ASD(tracks, scores, args):
    """Evaluate ASD predictions against the Columbia dataset labels.

    Loads per-speaker ground-truth boxes and speak/not-speak labels from
    ``<videoFolder>/col_labels/fusion/*.txt`` (29.97 fps frame indices are
    rescaled to 25 fps), matches each labelled face to the best-overlapping
    predicted face, then prints per-speaker ACC/F1 and the average F1 over
    the five scored speakers ("abbas" is excluded from the average).
    """
    txtPath = args.videoFolder + "/col_labels/fusion/*.txt"  # Load labels
    predictionSet = {}
    for name in {"long", "bell", "boll", "lieb", "sick", "abbas"}:
        predictionSet[name] = [[], []]
    dictGT = {}
    txtFiles = glob.glob("%s" % txtPath)
    for file in txtFiles:
        lines = open(file).read().splitlines()
        idName = file.split("/")[-1][:-4]
        for line in lines:
            data = line.split("\t")
            # Label frame index (29.97 fps) -> our 25 fps frame index.
            frame = int(int(data[0]) / 29.97 * 25)
            x1 = int(data[1])
            y1 = int(data[2])
            x2 = int(data[1]) + int(data[3])
            y2 = int(data[2]) + int(data[3])
            gt = int(data[4])
            if frame in dictGT:
                dictGT[frame].append([x1, y1, x2, y2, gt, idName])
            else:
                dictGT[frame] = [[x1, y1, x2, y2, gt, idName]]
    flist = glob.glob(os.path.join(args.pyframesPath, "*.jpg"))  # Load files
    flist.sort()
    faces = [[] for i in range(len(flist))]
    for tidx, track in enumerate(tracks):
        score = scores[tidx]
        for fidx, frame in enumerate(track["track"]["frame"].tolist()):
            s = numpy.mean(
                score[max(fidx - 2, 0) : min(fidx + 3, len(score) - 1)]
            )  # average smoothing
            faces[frame].append(
                {
                    "track": tidx,
                    "score": float(s),
                    "s": track["proc_track"]["s"][fidx],
                    "x": track["proc_track"]["x"][fidx],
                    "y": track["proc_track"]["y"][fidx],
                }
            )
    for fidx, fname in tqdm.tqdm(enumerate(flist), total=len(flist)):
        if fidx in dictGT:  # This frame has label
            for gtThisFrame in dictGT[fidx]:  # What this label is ?
                faceGT = gtThisFrame[0:4]
                labelGT = gtThisFrame[4]
                idGT = gtThisFrame[5]
                ious = []
                for face in faces[fidx]:  # Find the right face in my result
                    faceLocation = [  # full-resolution box (unused, kept from upstream)
                        int(face["x"] - face["s"]),
                        int(face["y"] - face["s"]),
                        int(face["x"] + face["s"]),
                        int(face["y"] + face["s"]),
                    ]
                    # Coordinates halved — presumably the Columbia labels were
                    # annotated on a half-resolution video; confirm upstream.
                    faceLocation_new = [
                        int(face["x"] - face["s"]) // 2,
                        int(face["y"] - face["s"]) // 2,
                        int(face["x"] + face["s"]) // 2,
                        int(face["y"] + face["s"]) // 2,
                    ]
                    iou = bb_intersection_over_union(
                        faceLocation_new, faceGT, evalCol=True
                    )
                    if iou > 0.5:
                        ious.append([iou, round(face["score"], 2)])
                if len(ious) > 0:  # Find my result
                    ious.sort()
                    labelPredict = ious[-1][1]  # score of the best-overlap face
                else:
                    labelPredict = 0
                x1 = faceGT[0]  # (unused, kept from upstream)
                y1 = faceGT[1]
                width = faceGT[2] - faceGT[0]
                predictionSet[idGT][0].append(labelPredict)
                predictionSet[idGT][1].append(labelGT)
    names = ["long", "bell", "boll", "lieb", "sick", "abbas"]  # Evaluate
    names.sort()
    F1s = 0
    for i in names:
        scores = numpy.array(predictionSet[i][0])
        labels = numpy.array(predictionSet[i][1])
        # A positive smoothed score counts as a "speaking" prediction.
        scores = numpy.int64(scores > 0)
        F1 = f1_score(labels, scores)
        ACC = accuracy_score(labels, scores)
        if i != "abbas":
            F1s += F1
        print("%s, ACC:%.2f, F1:%.2f" % (i, 100 * ACC, 100 * F1))
    print("Average F1:%.2f" % (100 * (F1s / 5)))
524
+
525
+
526
# Main function
def main():
    """Run the full TalkNet active-speaker-detection demo pipeline.

    Re-encodes the input video at 25 fps, extracts the audio track and all
    frames, detects scenes and faces, tracks and crops every face, scores
    each crop with TalkNet, and finally either evaluates on the Columbia
    labels (``args.evalCol``) or writes an annotated output video.

    Relies on the module-level ``args`` namespace for all paths/settings.
    """
    # This preprocessing is modified based on this [repository](https://github.com/joonson/syncnet_python).
    # ```
    # .
    # ├── pyavi
    # │   ├── audio.wav (Audio from input video)
    # │   ├── video.avi (Copy of the input video)
    # │   ├── video_only.avi (Output video without audio)
    # │   └── video_out.avi (Output video with audio)
    # ├── pycrop (The detected face videos and audios)
    # │ ├── 000000.avi
    # │ ├── 000000.wav
    # │ ├── 000001.avi
    # │ ├── 000001.wav
    # │ └── ...
    # ├── pyframes (All the video frames in this video)
    # │ ├── 000001.jpg
    # │ ├── 000002.jpg
    # │ └── ...
    # └── pywork
    # ├── faces.pckl (face detection result)
    # ├── scene.pckl (scene detection result)
    # ├── scores.pckl (ASD result)
    # └── tracks.pckl (face tracking result)
    # ```

    # Initialization: wipe any previous run, then recreate the working dirs.
    args.pyaviPath = os.path.join(args.savePath, "pyavi")
    args.pyframesPath = os.path.join(args.savePath, "pyframes")
    args.pyworkPath = os.path.join(args.savePath, "pywork")
    args.pycropPath = os.path.join(args.savePath, "pycrop")
    if os.path.exists(args.savePath):
        rmtree(args.savePath)
    os.makedirs(
        args.pyaviPath, exist_ok=True
    )  # The path for the input video, input audio, output video
    os.makedirs(args.pyframesPath, exist_ok=True)  # Save all the video frames
    os.makedirs(
        args.pyworkPath, exist_ok=True
    )  # Save the results in this process by the pckl method
    os.makedirs(
        args.pycropPath, exist_ok=True
    )  # Save the detected face clips (audio+video) in this process

    # Extract video
    args.videoFilePath = os.path.join(args.pyaviPath, "video.avi")
    # If duration was not set, extract the whole video, otherwise extract the video from 'args.start' to 'args.start + args.duration'
    if args.duration == 0:
        command = (
            "ffmpeg -y -i %s -qscale:v 2 -threads %d -async 1 -r 25 %s -loglevel panic"
            % (args.videoPath, args.nDataLoaderThread, args.videoFilePath)
        )
    else:
        command = (
            "ffmpeg -y -i %s -qscale:v 2 -threads %d -ss %.3f -to %.3f -async 1 -r 25 %s -loglevel panic"
            % (
                args.videoPath,
                args.nDataLoaderThread,
                args.start,
                args.start + args.duration,
                args.videoFilePath,
            )
        )
    subprocess.call(command, shell=True, stdout=None)
    sys.stderr.write(
        time.strftime("%Y-%m-%d %H:%M:%S")
        + " Extract the video and save in %s \r\n" % (args.videoFilePath)
    )

    # Extract audio (mono, 16 kHz — matches the MFCC settings downstream)
    args.audioFilePath = os.path.join(args.pyaviPath, "audio.wav")
    command = (
        "ffmpeg -y -i %s -qscale:a 0 -ac 1 -vn -threads %d -ar 16000 %s -loglevel panic"
        % (args.videoFilePath, args.nDataLoaderThread, args.audioFilePath)
    )
    subprocess.call(command, shell=True, stdout=None)
    sys.stderr.write(
        time.strftime("%Y-%m-%d %H:%M:%S")
        + " Extract the audio and save in %s \r\n" % (args.audioFilePath)
    )

    # Extract the video frames
    command = "ffmpeg -y -i %s -qscale:v 2 -threads %d -f image2 %s -loglevel panic" % (
        args.videoFilePath,
        args.nDataLoaderThread,
        os.path.join(args.pyframesPath, "%06d.jpg"),
    )
    subprocess.call(command, shell=True, stdout=None)
    sys.stderr.write(
        time.strftime("%Y-%m-%d %H:%M:%S")
        + " Extract the frames and save in %s \r\n" % (args.pyframesPath)
    )

    # Scene detection for the video frames
    scene = scene_detect(args)
    sys.stderr.write(
        time.strftime("%Y-%m-%d %H:%M:%S")
        + " Scene detection and save in %s \r\n" % (args.pyworkPath)
    )

    # Face detection for the video frames
    faces = inference_video(args)
    sys.stderr.write(
        time.strftime("%Y-%m-%d %H:%M:%S")
        + " Face detection and save in %s \r\n" % (args.pyworkPath)
    )

    # Face tracking
    allTracks, vidTracks = [], []
    for shot in scene:
        if (
            shot[1].frame_num - shot[0].frame_num >= args.minTrack
        ):  # Discard the shot frames less than minTrack frames
            allTracks.extend(
                track_shot(args, faces[shot[0].frame_num : shot[1].frame_num])
            )  # 'frames' to present this tracks' timestep, 'bbox' presents the location of the faces
    sys.stderr.write(
        time.strftime("%Y-%m-%d %H:%M:%S")
        + " Face track and detected %d tracks \r\n" % len(allTracks)
    )

    # Face clips cropping
    for ii, track in tqdm.tqdm(enumerate(allTracks), total=len(allTracks)):
        vidTracks.append(
            crop_video(args, track, os.path.join(args.pycropPath, "%05d" % ii))
        )
    savePath = os.path.join(args.pyworkPath, "tracks.pckl")
    with open(savePath, "wb") as fil:
        pickle.dump(vidTracks, fil)
    sys.stderr.write(
        time.strftime("%Y-%m-%d %H:%M:%S")
        + " Face Crop and saved in %s tracks \r\n" % args.pycropPath
    )
    fil = open(savePath, "rb")
    vidTracks = pickle.load(fil)

    # Active Speaker Detection by TalkNet
    files = glob.glob("%s/*.avi" % args.pycropPath)
    files.sort()
    scores = evaluate_network(files, args)
    savePath = os.path.join(args.pyworkPath, "scores.pckl")
    with open(savePath, "wb") as fil:
        pickle.dump(scores, fil)
    sys.stderr.write(
        time.strftime("%Y-%m-%d %H:%M:%S")
        + " Scores extracted and saved in %s \r\n" % args.pyworkPath
    )

    if args.evalCol == True:
        evaluate_col_ASD(
            vidTracks, scores, args
        )  # The columnbia video is too big for visualization. You can still add the `visualization` funcition here if you want
        quit()
    else:
        # Visualization, save the result as the new video
        visualization(vidTracks, scores, args)


if __name__ == "__main__":
    main()
talknet-asd/export_onnx_cpu.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import torch
4
+
5
+ from loss import lossAV
6
+ from model.talkNetModel import talkNetModel
7
+
8
+
9
class TalkNetCPU(torch.nn.Module):
    """CPU-only wrapper for TalkNet export.

    Bundles the TalkNet backbone with the lossAV classification head so the
    full audio-visual scoring path can be exported as a single ONNX graph.
    """

    def __init__(self, ckpt_path: str):
        super().__init__()
        self.model = talkNetModel()
        self.lossAV = lossAV()
        # The checkpoint is loaded lazily by load_parameters(), not here.
        self.ckpt_path = ckpt_path

    def load_parameters(self) -> None:
        """Load state_dict saved by talkNet.saveParameters (handles module. prefix)."""
        self_state = self.state_dict()
        # NOTE(review): torch.load unpickles arbitrary objects — only load
        # trusted checkpoints (consider weights_only=True on recent torch).
        loaded_state = torch.load(self.ckpt_path, map_location="cpu")

        for name, param in loaded_state.items():
            orig_name = name
            target_name = name
            # Checkpoints saved from DataParallel carry a "module." prefix.
            if target_name not in self_state:
                target_name = target_name.replace("module.", "")
            if target_name not in self_state:
                print(f"{orig_name} is not in the model.")
                continue
            if self_state[target_name].shape != loaded_state[orig_name].shape:
                print(
                    f"Shape mismatch {orig_name}: "
                    f"model {self_state[target_name].shape}, "
                    f"loaded {loaded_state[orig_name].shape}"
                )
                continue
            # In-place copy into the state_dict tensor updates the module's
            # parameters, which share the same storage.
            self_state[target_name].copy_(param)

    def forward(self, audio_mfcc: torch.Tensor, video_gray: torch.Tensor) -> torch.Tensor:
        """
        audio_mfcc: (B, Ta, 13)
        video_gray: (B, Tv, 224, 224)
        returns logits: (B*, 2)

        NOTE(review): the export script feeds 112x112 crops (demoTalkNet
        center-crops 224 -> 112) — confirm which spatial size is intended.
        """
        audio_embed = self.model.forward_audio_frontend(audio_mfcc)
        visual_embed = self.model.forward_visual_frontend(video_gray)
        audio_embed, visual_embed = self.model.forward_cross_attention(
            audio_embed, visual_embed
        )
        av_embed = self.model.forward_audio_visual_backend(audio_embed, visual_embed)
        # Raw 2-class logits from the lossAV head (no softmax in the graph).
        logits = self.lossAV.FC(av_embed)
        return logits
54
+
55
+
56
def main() -> None:
    """Export the CPU TalkNet (backbone + lossAV.FC head) to ONNX.

    Checkpoint and output paths are taken from the CKPT_PATH / OUT_PATH
    environment variables, with repo-relative defaults.
    """
    ckpt_path = os.environ.get("CKPT_PATH", "model/pretrain_TalkSet.model")
    out_path = os.environ.get("OUT_PATH", "talknet_asd_cpu.onnx")

    model = TalkNetCPU(ckpt_path)
    model.load_parameters()
    model.eval()

    # Dummy inputs only to build the graph; real lengths are dynamic via dynamic_axes.
    dummy_audio = torch.randn(1, 100, 13)  # ~1s MFCC (100 frames)
    # Model expects 112x112 (demoTalkNet crops 224->center 112)
    dummy_video = torch.randn(1, 25, 112, 112)  # 25 frames of 112x112 gray crops

    torch.onnx.export(
        model,
        (dummy_audio, dummy_video),
        out_path,
        input_names=["audio_mfcc", "video_gray"],
        output_names=["logits"],
        # Batch and time axes stay symbolic so any clip length can be scored.
        dynamic_axes={
            "audio_mfcc": {0: "batch", 1: "time_audio"},
            "video_gray": {0: "batch", 1: "time_video"},
            "logits": {0: "time_any"},
        },
        opset_version=14,
    )
    print(f"Saved ONNX to {out_path}")


if __name__ == "__main__":
    main()
87
+
talknet-asd/loss.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+
5
class lossAV(nn.Module):
    """Audio-visual head: maps 256-d AV embeddings to speak/not-speak logits.

    At inference (``labels is None``) it returns the raw "speaking" logit per
    frame; at training time it returns the cross-entropy loss plus softmax
    scores, hard predictions, and the number of correct predictions.
    """

    def __init__(self):
        super(lossAV, self).__init__()
        self.criterion = nn.CrossEntropyLoss()
        self.FC = nn.Linear(256, 2)

    def forward(self, x, labels=None):
        """Score (inference) or compute the loss (training).

        Args:
            x: (B, 1, 256) audio-visual embeddings.
            labels: (B,) class indices, or None for inference.

        Returns:
            If labels is None: numpy array (B,) of raw "speaking" logits.
            Otherwise: (loss, softmax scores, predicted labels, #correct).
        """
        x = x.squeeze(1)
        x = self.FC(x)
        # `is None`, not `== None`: == on a tensor is elementwise and is not
        # a reliable way to test for the absence of labels.
        if labels is None:
            predScore = x[:, 1]
            predScore = predScore.t()
            predScore = predScore.view(-1).detach().cpu().numpy()
            return predScore
        else:
            nloss = self.criterion(x, labels)
            predScore = F.softmax(x, dim=-1)
            predLabel = torch.round(F.softmax(x, dim=-1))[:, 1]
            correctNum = (predLabel == labels).sum().float()
            return nloss, predScore, predLabel, correctNum
25
+
26
class lossA(nn.Module):
    """Audio-only head: 128-d embeddings -> 2-way logits, cross-entropy loss."""

    def __init__(self):
        super(lossA, self).__init__()
        self.criterion = nn.CrossEntropyLoss()
        self.FC = nn.Linear(128, 2)

    def forward(self, x, labels):
        """Return the cross-entropy loss for (B, 1, 128) embeddings."""
        logits = self.FC(x.squeeze(1))
        return self.criterion(logits, labels)
37
+
38
class lossV(nn.Module):
    """Visual-only head: 128-d embeddings -> 2-way logits, cross-entropy loss."""

    def __init__(self):
        super(lossV, self).__init__()

        self.criterion = nn.CrossEntropyLoss()
        self.FC = nn.Linear(128, 2)

    def forward(self, x, labels):
        """Return the cross-entropy loss for (B, 1, 128) embeddings."""
        logits = self.FC(x.squeeze(1))
        return self.criterion(logits, labels)
50
+
talknet-asd/model/attentionLayer.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ from torch.nn import functional as F
4
+ from torch.nn import MultiheadAttention
5
+
6
class attentionLayer(nn.Module):
    """Transformer-style cross-attention block.

    ``tar`` attends over ``src``, then a position-wise feed-forward network
    is applied; both sub-layers use a residual connection followed by
    LayerNorm (post-norm).
    """

    def __init__(self, d_model, nhead, dropout=0.1):
        super(attentionLayer, self).__init__()
        self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)

        self.linear1 = nn.Linear(d_model, d_model * 4)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(d_model * 4, d_model)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

        self.activation = F.relu

    def forward(self, src, tar):
        """src, tar: (B, T, C) -> (B, T, C)."""
        # This (sequence-first) MultiheadAttention expects (T, B, C).
        query = tar.transpose(0, 1)
        keyval = src.transpose(0, 1)
        attended = self.self_attn(query, keyval, keyval, attn_mask=None,
                                  key_padding_mask=None)[0]
        out = self.norm1(keyval + self.dropout1(attended))

        ffn = self.linear2(self.dropout(self.activation(self.linear1(out))))
        out = self.norm2(out + self.dropout2(ffn))
        return out.transpose(0, 1)  # back to (B, T, C)
talknet-asd/model/audioEncoder.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+
5
class SEBasicBlock(nn.Module):
    """ResNet basic block with a Squeeze-and-Excitation gate on the main branch."""

    expansion = 1  # output channels = planes * expansion

    def __init__(self, inplanes, planes, stride=1, downsample=None, reduction=8):
        super(SEBasicBlock, self).__init__()
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.se = SELayer(planes, reduction)
        # Projection for the shortcut when channel count / stride changes.
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x

        # NOTE(review): relu is applied before bn1 here (conv-relu-bn), unlike
        # the usual conv-bn-relu order — kept as-is to match trained weights.
        out = self.conv1(x)
        out = self.relu(out)
        out = self.bn1(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.se(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)
        return out
36
+
37
class SELayer(nn.Module):
    """Squeeze-and-Excitation: per-channel gating from globally pooled stats."""

    def __init__(self, channel, reduction=8):
        super(SELayer, self).__init__()
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Sequential(
            nn.Linear(channel, channel // reduction),
            nn.ReLU(inplace=True),
            nn.Linear(channel // reduction, channel),
            nn.Sigmoid()
        )

    def forward(self, x):
        """Scale each channel of (B, C, H, W) by a learned gate in (0, 1)."""
        batch, channels = x.size(0), x.size(1)
        squeezed = self.avg_pool(x).view(batch, channels)
        gates = self.fc(squeezed).view(batch, channels, 1, 1)
        return x * gates
53
+
54
class audioEncoder(nn.Module):
    """SE-ResNet style encoder turning a 2-D MFCC map into a feature sequence.

    Input is (B, 1, H, W); dimension 2 is average-pooled away in forward()
    (assumed to be the frequency axis — TODO confirm against the caller) and
    the output is (B, W', C) with C = num_filters[3].
    """

    def __init__(self, layers, num_filters, **kwargs):
        # `layers` gives the block count per stage, `num_filters` the channel
        # widths; extra **kwargs are accepted but ignored.
        super(audioEncoder, self).__init__()
        block = SEBasicBlock
        self.inplanes = num_filters[0]

        # Stride (2, 1): halve dimension 2, keep dimension 3 resolution.
        self.conv1 = nn.Conv2d(1, num_filters[0], kernel_size=7, stride=(2, 1), padding=3,
                               bias=False)
        self.bn1 = nn.BatchNorm2d(num_filters[0])
        self.relu = nn.ReLU(inplace=True)

        self.layer1 = self._make_layer(block, num_filters[0], layers[0])
        self.layer2 = self._make_layer(block, num_filters[1], layers[1], stride=(2, 2))
        self.layer3 = self._make_layer(block, num_filters[2], layers[2], stride=(2, 2))
        self.layer4 = self._make_layer(block, num_filters[3], layers[3], stride=(1, 1))
        out_dim = num_filters[3] * block.expansion

        # Standard Kaiming / BN initialisation.
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

    def _make_layer(self, block, planes, blocks, stride=1):
        """Stack ``blocks`` SE blocks; project the shortcut when shapes change."""
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes * block.expansion,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(planes * block.expansion),
            )

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample))
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes))

        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        # Collapse dimension 2, flatten, and move channels last:
        # (B, C, H', W') -> (B, W', C).
        x = torch.mean(x, dim=2, keepdim=True)
        x = x.view((x.size()[0], x.size()[1], -1))
        x = x.transpose(1, 2)

        return x
talknet-asd/model/faceDetector/README.md ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # Face detector
2
+
3
+ This face detector is adapted from `https://github.com/cs-giung/face-detection-pytorch`.
talknet-asd/model/faceDetector/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .s3fd import S3FD
talknet-asd/model/faceDetector/s3fd/__init__.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time, os, sys, subprocess
2
+ import numpy as np
3
+ import cv2
4
+ import torch
5
+ from torchvision import transforms
6
+ from .nets import S3FDNet
7
+ from .box_utils import nms_
8
+
9
# Path (relative to the working directory) of the S3FD weights; download
# them from Google Drive on first use.
PATH_WEIGHT = 'model/faceDetector/s3fd/sfd_face.pth'
if os.path.isfile(PATH_WEIGHT) == False:
    Link = "1KafnHz7ccT-3IyddBsL5yi2xGtxAKypt"
    # NOTE(review): `gdown --id` is deprecated in newer gdown releases — verify.
    cmd = "gdown --id %s -O %s"%(Link, PATH_WEIGHT)
    subprocess.call(cmd, shell=True, stdout=None)
# Per-channel pixel means, shaped (3, 1, 1) to broadcast over CHW images.
img_mean = np.array([104., 117., 123.])[:, np.newaxis, np.newaxis].astype('float32')
15
+
16
+
17
class S3FD():
    """Thin inference wrapper around S3FDNet with multi-scale face detection."""

    def __init__(self, device='cuda'):

        tstamp = time.time()
        self.device = device

        # print('[S3FD] loading with', self.device)
        self.net = S3FDNet(device=self.device).to(self.device)
        # PATH_WEIGHT is cwd-relative, so resolve against the working dir.
        PATH = os.path.join(os.getcwd(), PATH_WEIGHT)
        state_dict = torch.load(PATH, map_location=self.device)
        self.net.load_state_dict(state_dict)
        self.net.eval()
        # print('[S3FD] finished loading (%.4f sec)' % (time.time() - tstamp))

    def detect_faces(self, image, conf_th=0.8, scales=[1]):
        """Detect faces in an image.

        Args:
            image: HxWx3 uint8 array.
            conf_th: minimum detection confidence.
            scales: resize factors; detections from all scales are merged
                and de-duplicated with NMS.

        Returns:
            (N, 5) array of [x1, y1, x2, y2, score] rows.
        """

        w, h = image.shape[1], image.shape[0]

        bboxes = np.empty(shape=(0, 5))

        with torch.no_grad():
            for s in scales:
                scaled_img = cv2.resize(image, dsize=(0, 0), fx=s, fy=s, interpolation=cv2.INTER_LINEAR)

                # HWC -> CHW
                scaled_img = np.swapaxes(scaled_img, 1, 2)
                scaled_img = np.swapaxes(scaled_img, 1, 0)
                # NOTE(review): channels are reversed, mean-subtracted, then
                # reversed again — the net effect is the per-channel means
                # applied in flipped order; kept as in the upstream detector.
                scaled_img = scaled_img[[2, 1, 0], :, :]
                scaled_img = scaled_img.astype('float32')
                scaled_img -= img_mean
                scaled_img = scaled_img[[2, 1, 0], :, :]
                x = torch.from_numpy(scaled_img).unsqueeze(0).to(self.device)
                y = self.net(x)

                detections = y.data
                # Scale normalised box coordinates back to input-image pixels.
                scale = torch.Tensor([w, h, w, h])

                for i in range(detections.size(1)):
                    j = 0
                    # Detections per class are sorted by confidence; stop at
                    # the first one below the threshold.
                    while detections[0, i, j, 0] > conf_th:
                        score = detections[0, i, j, 0]
                        pt = (detections[0, i, j, 1:] * scale).cpu().numpy()
                        bbox = (pt[0], pt[1], pt[2], pt[3], score)
                        bboxes = np.vstack((bboxes, bbox))
                        j += 1

        # Merge overlapping detections across scales.
        keep = nms_(bboxes, 0.1)
        bboxes = bboxes[keep]

        return bboxes
talknet-asd/model/faceDetector/s3fd/box_utils.py ADDED
@@ -0,0 +1,217 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ from itertools import product as product
3
+ import torch
4
+ from torch.autograd import Function
5
+
6
+
7
def nms_(dets, thresh):
    """
    Greedy non-maximum suppression over (N, 5) [x1, y1, x2, y2, score] rows.
    Courtesy of Ross Girshick
    [https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/nms/py_cpu_nms.py]
    """
    x1, y1, x2, y2 = dets[:, 0], dets[:, 1], dets[:, 2], dets[:, 3]
    scores = dets[:, 4]

    areas = (x2 - x1) * (y2 - y1)
    order = scores.argsort()[::-1]  # highest score first

    keep = []
    while order.size > 0:
        best = order[0]
        keep.append(int(best))
        # Intersection of the kept box with every remaining candidate.
        ix1 = np.maximum(x1[best], x1[order[1:]])
        iy1 = np.maximum(y1[best], y1[order[1:]])
        ix2 = np.minimum(x2[best], x2[order[1:]])
        iy2 = np.minimum(y2[best], y2[order[1:]])

        inter = np.maximum(0.0, ix2 - ix1) * np.maximum(0.0, iy2 - iy1)
        overlap = inter / (areas[best] + areas[order[1:]] - inter)

        # Keep only candidates that do not overlap the chosen box too much.
        order = order[1:][overlap <= thresh]

    return np.array(keep).astype(int)
39
+
40
+
41
def decode(loc, priors, variances):
    """Decode locations from predictions using priors to undo
    the encoding we did for offset regression at train time.
    Args:
        loc (tensor): location predictions for loc layers,
            Shape: [num_priors,4]
        priors (tensor): Prior boxes in center-offset form.
            Shape: [num_priors,4].
        variances: (list[float]) Variances of priorboxes
    Return:
        decoded bounding box predictions
    """

    # Apply the variance-scaled offsets: centers shift, sizes scale.
    centers = priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:]
    sizes = priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])
    boxes = torch.cat((centers, sizes), 1)
    # Convert (cx, cy, w, h) to corner form (x1, y1, x2, y2) in place.
    boxes[:, :2] -= boxes[:, 2:] / 2
    boxes[:, 2:] += boxes[:, :2]
    return boxes
60
+
61
+
62
def nms(boxes, scores, overlap=0.5, top_k=200):
    """Apply non-maximum suppression at test time to avoid detecting too many
    overlapping bounding boxes for a given object.

    Args:
        boxes: (tensor) The location preds for the img, Shape: [num_priors,4].
        scores: (tensor) The class predscores for the img, Shape:[num_priors].
        overlap: (float) The overlap thresh for suppressing unnecessary boxes.
        top_k: (int) The Maximum number of box preds to consider.

    Return:
        (keep, count): `keep` is a long tensor of length num_priors whose
        first `count` entries are the indices of the kept boxes (best score
        first); entries past `count` are meaningless (zero-initialised).
    """
    keep = scores.new(scores.size(0)).zero_().long()
    if boxes.numel() == 0:
        return keep, 0

    x1 = boxes[:, 0]
    y1 = boxes[:, 1]
    x2 = boxes[:, 2]
    y2 = boxes[:, 3]
    area = torch.mul(x2 - x1, y2 - y1)

    # Sort ascending and keep the indices of the top-k highest scores at
    # the end, so idx[-1] is always the current best candidate.
    _, idx = scores.sort(0)
    idx = idx[-top_k:]

    count = 0
    while idx.numel() > 0:
        i = idx[-1]  # index of the current largest score
        keep[count] = i
        count += 1
        if idx.size(0) == 1:
            break
        idx = idx[:-1]  # remove the kept element

        # Intersection of the kept box with each remaining candidate.
        # (The original pre-allocated `out=` buffers and `resize_as_` calls
        # were dead/deprecated API; plain index_select + clamp is equivalent.)
        xx1 = torch.index_select(x1, 0, idx).clamp(min=float(x1[i]))
        yy1 = torch.index_select(y1, 0, idx).clamp(min=float(y1[i]))
        xx2 = torch.index_select(x2, 0, idx).clamp(max=float(x2[i]))
        yy2 = torch.index_select(y2, 0, idx).clamp(max=float(y2[i]))
        w = (xx2 - xx1).clamp(min=0.0)
        h = (yy2 - yy1).clamp(min=0.0)
        inter = w * h

        # IoU = i / (area(a) + area(b) - i)
        rem_areas = torch.index_select(area, 0, idx)
        union = (rem_areas - inter) + area[i]
        IoU = inter / union

        # keep only elements with an IoU <= overlap
        idx = idx[IoU.le(overlap)]
    return keep, count
127
+
128
+
129
class Detect(object):
    """Turn raw SSD-style network outputs into final detections.

    Decodes location regressions against the prior boxes, filters priors by
    class confidence, and applies per-class NMS, producing a fixed-size
    output tensor of (score, x1, y1, x2, y2) rows.
    """

    def __init__(self, num_classes=2,
                 top_k=750, nms_thresh=0.3, conf_thresh=0.05,
                 variance=[0.1, 0.2], nms_top_k=5000):
        # num_classes: number of classes including background (index 0).
        # top_k: maximum detections kept per class after NMS.
        # nms_thresh / nms_top_k: IoU threshold and candidate cap for NMS.
        # conf_thresh: minimum class confidence for a prior to be considered.
        # variance: prior-box variances consumed by `decode`.
        self.num_classes = num_classes
        self.top_k = top_k
        self.nms_thresh = nms_thresh
        self.conf_thresh = conf_thresh
        self.variance = variance
        self.nms_top_k = nms_top_k

    def forward(self, loc_data, conf_data, prior_data):
        """Decode and suppress a batch of predictions.

        Args:
            loc_data: (tensor) location predictions, shape [batch, num_priors, 4].
            conf_data: (tensor) class confidences, shape [batch, num_priors, num_classes].
            prior_data: (tensor) prior boxes in center-size form, shape [num_priors, 4].

        Returns:
            tensor of shape [batch, num_classes, top_k, 5]; each kept row is
            (score, x1, y1, x2, y2), zero-padded past the per-class count.
        """
        num = loc_data.size(0)
        num_priors = prior_data.size(0)

        conf_preds = conf_data.view(num, num_priors, self.num_classes).transpose(2, 1)
        # Broadcast the priors across the batch for a single decode call.
        batch_priors = prior_data.view(-1, num_priors, 4).expand(num, num_priors, 4)
        batch_priors = batch_priors.contiguous().view(-1, 4)

        decoded_boxes = decode(loc_data.view(-1, 4), batch_priors, self.variance)
        decoded_boxes = decoded_boxes.view(num, num_priors, 4)

        output = torch.zeros(num, self.num_classes, self.top_k, 5)

        for i in range(num):
            boxes = decoded_boxes[i].clone()
            conf_scores = conf_preds[i].clone()

            for cl in range(1, self.num_classes):  # class 0 is background
                c_mask = conf_scores[cl].gt(self.conf_thresh)
                scores = conf_scores[cl][c_mask]

                # BUGFIX: boolean masking returns a 1-D tensor, so the original
                # `scores.dim() == 0` test never fired; check emptiness instead
                # so NMS is skipped when no prior passes the threshold.
                if scores.numel() == 0:
                    continue
                l_mask = c_mask.unsqueeze(1).expand_as(boxes)
                boxes_ = boxes[l_mask].view(-1, 4)
                ids, count = nms(boxes_, scores, self.nms_thresh, self.nms_top_k)
                count = count if count < self.top_k else self.top_k

                output[i, cl, :count] = torch.cat((scores[ids[:count]].unsqueeze(1), boxes_[ids[:count]]), 1)

        return output
174
+
175
+
176
class PriorBox(object):
    """Generate S3FD anchor (prior) boxes in center-size form.

    Emits one square prior per feature-map cell; the prior for scale k has
    side `min_sizes[k]` expressed relative to the input image dimensions.
    """

    def __init__(self, input_size, feature_maps,
                 variance=[0.1, 0.2],
                 min_sizes=[16, 32, 64, 128, 256, 512],
                 steps=[4, 8, 16, 32, 64, 128],
                 clip=False):
        # input_size: (height, width) of the network input image.
        # feature_maps: list of (height, width) pairs, one per detection scale.
        # steps: backbone stride at each scale; min_sizes: anchor side in pixels.
        super(PriorBox, self).__init__()

        self.imh = input_size[0]
        self.imw = input_size[1]
        self.feature_maps = feature_maps

        self.variance = variance
        self.min_sizes = min_sizes
        self.steps = steps
        self.clip = clip

    def forward(self):
        """Return a [num_priors, 4] float tensor of (cx, cy, w, h) in unit coords."""
        priors = []
        for scale, fmap in enumerate(self.feature_maps):
            rows, cols = fmap[0], fmap[1]
            # Loop-invariant per-scale quantities: effective grid size and
            # the anchor size normalised by the image dimensions.
            cells_w = self.imw / self.steps[scale]
            cells_h = self.imh / self.steps[scale]
            box_w = self.min_sizes[scale] / self.imw
            box_h = self.min_sizes[scale] / self.imh

            for row, col in product(range(rows), range(cols)):
                cx = (col + 0.5) / cells_w
                cy = (row + 0.5) / cells_h
                priors += [cx, cy, box_w, box_h]

        out = torch.FloatTensor(priors).view(-1, 4)

        if self.clip:
            out.clamp_(max=1, min=0)

        return out
talknet-asd/model/faceDetector/s3fd/nets.py ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ import torch.nn.init as init
5
+ from .box_utils import Detect, PriorBox
6
+
7
+
8
class L2Norm(nn.Module):
    """Channel-wise L2 normalisation with a learnable per-channel scale.

    Each spatial position of a (N, C, H, W) input is normalised to unit
    L2 norm across the channel axis, then multiplied by a learned weight
    per channel (initialised to `scale`).
    """

    def __init__(self, n_channels, scale):
        super(L2Norm, self).__init__()
        self.n_channels = n_channels
        self.gamma = scale or None  # initial value for every channel weight
        self.eps = 1e-10            # numerical guard against division by zero
        self.weight = nn.Parameter(torch.Tensor(self.n_channels))
        self.reset_parameters()

    def reset_parameters(self):
        """Fill every channel weight with the initial scale `gamma`."""
        init.constant_(self.weight, self.gamma)

    def forward(self, x):
        # L2 norm over channels at each spatial location, then rescale.
        denom = x.pow(2).sum(dim=1, keepdim=True).sqrt() + self.eps
        normalised = torch.div(x, denom)
        scale = self.weight.unsqueeze(0).unsqueeze(2).unsqueeze(3).expand_as(normalised)
        return scale * normalised
26
+
27
+
28
class S3FDNet(nn.Module):
    """S3FD single-shot face detector.

    A VGG16 backbone (with dilated conv replacements for fc6/fc7) plus two
    extra down-sampling conv pairs yields six feature maps.  Per-map
    location and confidence heads feed prior-box generation, decoding and
    NMS (via `Detect`), returning final (score, box) detections.
    """

    def __init__(self, device='cuda'):
        super(S3FDNet, self).__init__()
        self.device = device

        # VGG16 layers as a flat list; forward() slices this list at fixed
        # indices (16 / 23 / 30), so order and count must not change.
        self.vgg = nn.ModuleList([
            nn.Conv2d(3, 64, 3, 1, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(64, 64, 3, 1, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2),

            nn.Conv2d(64, 128, 3, 1, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(128, 128, 3, 1, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2),

            nn.Conv2d(128, 256, 3, 1, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, 3, 1, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, 3, 1, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2, ceil_mode=True),

            nn.Conv2d(256, 512, 3, 1, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 512, 3, 1, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 512, 3, 1, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2),

            nn.Conv2d(512, 512, 3, 1, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 512, 3, 1, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 512, 3, 1, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2),

            # fc6/fc7 replacements: dilated 3x3 then 1x1 conv.
            nn.Conv2d(512, 1024, 3, 1, padding=6, dilation=6),
            nn.ReLU(inplace=True),
            nn.Conv2d(1024, 1024, 1, 1),
            nn.ReLU(inplace=True),
        ])

        # L2 normalisation on the shallow taps (their activations have much
        # larger magnitudes than the deeper feature maps).
        self.L2Norm3_3 = L2Norm(256, 10)
        self.L2Norm4_3 = L2Norm(512, 8)
        self.L2Norm5_3 = L2Norm(512, 5)

        # Extra down-sampling layers producing the two smallest feature maps.
        self.extras = nn.ModuleList([
            nn.Conv2d(1024, 256, 1, 1),
            nn.Conv2d(256, 512, 3, 2, padding=1),
            nn.Conv2d(512, 128, 1, 1),
            nn.Conv2d(128, 256, 3, 2, padding=1),
        ])

        # One location head (4 box offsets) per detection scale.
        self.loc = nn.ModuleList([
            nn.Conv2d(256, 4, 3, 1, padding=1),
            nn.Conv2d(512, 4, 3, 1, padding=1),
            nn.Conv2d(512, 4, 3, 1, padding=1),
            nn.Conv2d(1024, 4, 3, 1, padding=1),
            nn.Conv2d(512, 4, 3, 1, padding=1),
            nn.Conv2d(256, 4, 3, 1, padding=1),
        ])

        # Confidence heads.  The first emits 4 channels: 3 background
        # candidates (reduced by max in forward()) + 1 face channel.
        self.conf = nn.ModuleList([
            nn.Conv2d(256, 4, 3, 1, padding=1),
            nn.Conv2d(512, 2, 3, 1, padding=1),
            nn.Conv2d(512, 2, 3, 1, padding=1),
            nn.Conv2d(1024, 2, 3, 1, padding=1),
            nn.Conv2d(512, 2, 3, 1, padding=1),
            nn.Conv2d(256, 2, 3, 1, padding=1),
        ])

        self.softmax = nn.Softmax(dim=-1)
        self.detect = Detect()

    def forward(self, x):
        """Run detection on an image batch.

        Args:
            x: (tensor) input images, shape (N, 3, H, W).

        Returns:
            `Detect` output: tensor [N, num_classes, top_k, 5] of
            (score, x1, y1, x2, y2) rows.
        """
        size = x.size()[2:]
        sources = list()
        loc = list()
        conf = list()

        # conv1_1 .. conv3_3 (vgg[0:16]); tap the L2-normalised conv3_3.
        for k in range(16):
            x = self.vgg[k](x)
        s = self.L2Norm3_3(x)
        sources.append(s)

        # conv4_1 .. conv4_3 (vgg[16:23]); tap the L2-normalised conv4_3.
        for k in range(16, 23):
            x = self.vgg[k](x)
        s = self.L2Norm4_3(x)
        sources.append(s)

        # conv5_1 .. conv5_3 (vgg[23:30]); tap the L2-normalised conv5_3.
        for k in range(23, 30):
            x = self.vgg[k](x)
        s = self.L2Norm5_3(x)
        sources.append(s)

        # fc6/fc7 replacement convs; their output is the fourth source.
        for k in range(30, len(self.vgg)):
            x = self.vgg[k](x)
        sources.append(x)

        # apply extra layers and cache source layer outputs
        # (every second extra layer is a stride-2 conv producing a new scale)
        for k, v in enumerate(self.extras):
            x = F.relu(v(x), inplace=True)
            if k % 2 == 1:
                sources.append(x)

        # apply multibox head to source layers
        loc_x = self.loc[0](sources[0])
        conf_x = self.conf[0](sources[0])

        # Max-out background: collapse the 3 background channels of the
        # first conf head into one via element-wise max.
        max_conf, _ = torch.max(conf_x[:, 0:3, :, :], dim=1, keepdim=True)
        conf_x = torch.cat((max_conf, conf_x[:, 3:, :, :]), dim=1)

        loc.append(loc_x.permute(0, 2, 3, 1).contiguous())
        conf.append(conf_x.permute(0, 2, 3, 1).contiguous())

        for i in range(1, len(sources)):
            x = sources[i]
            conf.append(self.conf[i](x).permute(0, 2, 3, 1).contiguous())
            loc.append(self.loc[i](x).permute(0, 2, 3, 1).contiguous())

        # Record each scale's (H, W) so PriorBox can match the head outputs.
        features_maps = []
        for i in range(len(loc)):
            feat = []
            feat += [loc[i].size(1), loc[i].size(2)]
            features_maps += [feat]

        loc = torch.cat([o.view(o.size(0), -1) for o in loc], 1)
        conf = torch.cat([o.view(o.size(0), -1) for o in conf], 1)

        # Priors depend on the input size, so they are rebuilt every forward.
        with torch.no_grad():
            self.priorbox = PriorBox(size, features_maps)
            self.priors = self.priorbox.forward()

        # NOTE(review): `type(type(x.data))` is a legacy tensor-type cast;
        # modern code would use `.to(x.dtype)` — confirm target torch version.
        output = self.detect.forward(
            loc.view(loc.size(0), -1, 4),
            self.softmax(conf.view(conf.size(0), -1, 2)),
            self.priors.type(type(x.data)).to(self.device)
        )

        return output
talknet-asd/model/talkNetModel.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+ from model.audioEncoder import audioEncoder
5
+ from model.visualEncoder import visualFrontend, visualTCN, visualConv1D
6
+ from model.attentionLayer import attentionLayer
7
+
8
class talkNetModel(nn.Module):
    """TalkNet audio-visual active-speaker backbone.

    Encodes the audio and visual streams into 128-dim per-frame embeddings,
    fuses them with bidirectional cross-attention, then combines the
    concatenated streams with self-attention.
    """

    def __init__(self):
        super(talkNetModel, self).__init__()
        # Visual Temporal Encoder
        self.visualFrontend = visualFrontend() # Visual Frontend
        # self.visualFrontend.load_state_dict(torch.load('visual_frontend.pt', map_location="cuda"))
        # for param in self.visualFrontend.parameters():
        #     param.requires_grad = False
        self.visualTCN = visualTCN() # Visual Temporal Network TCN
        self.visualConv1D = visualConv1D() # Visual Temporal Network Conv1d

        # Audio Temporal Encoder
        self.audioEncoder = audioEncoder(layers = [3, 4, 6, 3], num_filters = [16, 32, 64, 128])

        # Audio-visual Cross Attention (each stream attends to the other)
        self.crossA2V = attentionLayer(d_model = 128, nhead = 8)
        self.crossV2A = attentionLayer(d_model = 128, nhead = 8)

        # Audio-visual Self Attention over the concatenated 256-dim stream
        self.selfAV = attentionLayer(d_model = 256, nhead = 8)

    def forward_visual_frontend(self, x):
        """Encode a (B, T, W, H) grayscale frame batch into (B, T, 128) features."""
        B, T, W, H = x.shape
        x = x.view(B*T, 1, 1, W, H)
        # Normalise pixels; 0.4161 / 0.1688 are presumably the dataset
        # mean / std of the grayscale mouth crops — TODO confirm.
        x = (x / 255 - 0.4161) / 0.1688
        x = self.visualFrontend(x)
        x = x.view(B, T, 512)
        # Conv1d modules expect (B, C, T); transpose back afterwards.
        x = x.transpose(1,2)
        x = self.visualTCN(x)
        x = self.visualConv1D(x)
        x = x.transpose(1,2)
        return x

    def forward_audio_frontend(self, x):
        """Encode an audio feature batch via the ResNet-style audio encoder."""
        # Add a channel axis and swap the last two dims for the 2-D encoder.
        x = x.unsqueeze(1).transpose(2, 3)
        x = self.audioEncoder(x)
        return x

    def forward_cross_attention(self, x1, x2):
        """Cross-attend audio and visual embeddings to each other."""
        x1_c = self.crossA2V(src = x1, tar = x2)
        x2_c = self.crossV2A(src = x2, tar = x1)
        return x1_c, x2_c

    def forward_audio_visual_backend(self, x1, x2):
        """Fuse both streams with self-attention; returns (B*T, 256)."""
        x = torch.cat((x1,x2), 2)
        x = self.selfAV(src = x, tar = x)
        # Flatten batch and time for the frame-level classifier heads.
        x = torch.reshape(x, (-1, 256))
        return x

    def forward_audio_backend(self,x):
        """Flatten the audio stream to (B*T, 128) for the audio-only head."""
        x = torch.reshape(x, (-1, 128))
        return x

    def forward_visual_backend(self,x):
        """Flatten the visual stream to (B*T, 128) for the visual-only head."""
        x = torch.reshape(x, (-1, 128))
        return x
+ return x
64
+
talknet-asd/model/visualEncoder.py ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ##
2
+ # ResNet18 Pretrained network to extract lip embedding
3
+ # This code is modified based on https://github.com/lordmartian/deep_avsr
4
+ ##
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+
10
+
11
class ResNetLayer(nn.Module):

    """
    A ResNet layer used to build the ResNet network.
    Architecture:
    --> conv-bn-relu -> conv -> + -> bn-relu -> conv-bn-relu -> conv -> + -> bn-relu -->
                 |              |           |                          |
                 -> downsample ->           ---------------------------
    (i.e. two residual sub-blocks; only the first may change stride/width)
    """

    def __init__(self, inplanes, outplanes, stride):
        # inplanes/outplanes: input/output channel counts.
        # stride: spatial stride of the first sub-block; when != 1 the
        # residual path goes through the 1x1 `downsample` conv instead.
        super(ResNetLayer, self).__init__()
        self.conv1a = nn.Conv2d(inplanes, outplanes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1a = nn.BatchNorm2d(outplanes, momentum=0.01, eps=0.001)
        self.conv2a = nn.Conv2d(outplanes, outplanes, kernel_size=3, stride=1, padding=1, bias=False)
        self.stride = stride
        self.downsample = nn.Conv2d(inplanes, outplanes, kernel_size=(1,1), stride=stride, bias=False)
        self.outbna = nn.BatchNorm2d(outplanes, momentum=0.01, eps=0.001)

        self.conv1b = nn.Conv2d(outplanes, outplanes, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1b = nn.BatchNorm2d(outplanes, momentum=0.01, eps=0.001)
        self.conv2b = nn.Conv2d(outplanes, outplanes, kernel_size=3, stride=1, padding=1, bias=False)
        self.outbnb = nn.BatchNorm2d(outplanes, momentum=0.01, eps=0.001)
        return


    def forward(self, inputBatch):
        """Run both residual sub-blocks; returns a (N, outplanes, H', W') batch."""
        # Sub-block A: conv-bn-relu -> conv, with identity or 1x1 downsample
        # residual depending on stride.
        batch = F.relu(self.bn1a(self.conv1a(inputBatch)))
        batch = self.conv2a(batch)
        if self.stride == 1:
            residualBatch = inputBatch
        else:
            residualBatch = self.downsample(inputBatch)
        batch = batch + residualBatch
        intermediateBatch = batch
        batch = F.relu(self.outbna(batch))

        # Sub-block B: always stride 1 with an identity residual
        # (the pre-activation output of sub-block A).
        batch = F.relu(self.bn1b(self.conv1b(batch)))
        batch = self.conv2b(batch)
        residualBatch = intermediateBatch
        batch = batch + residualBatch
        outputBatch = F.relu(self.outbnb(batch))
        return outputBatch
54
+
55
+
56
+
57
class ResNet(nn.Module):

    """
    An 18-layer ResNet architecture: four ResNetLayer stages followed by
    a 4x4 average pool.  Channel width doubles while spatial resolution
    halves at each stage after the first.
    """

    def __init__(self):
        super(ResNet, self).__init__()
        self.layer1 = ResNetLayer(64, 64, stride=1)
        self.layer2 = ResNetLayer(64, 128, stride=2)
        self.layer3 = ResNetLayer(128, 256, stride=2)
        self.layer4 = ResNetLayer(256, 512, stride=2)
        self.avgpool = nn.AvgPool2d(kernel_size=(4, 4), stride=(1, 1))

    def forward(self, inputBatch):
        """Run the four residual stages, then average-pool the result."""
        batch = inputBatch
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            batch = stage(batch)
        return self.avgpool(batch)
81
+
82
+
83
class GlobalLayerNorm(nn.Module):
    """Global layer normalisation over channel and time axes.

    Each sample of an [M, N, K] batch is normalised by its global mean and
    variance (over dims 1 and 2), then scaled and shifted by learnable
    per-channel parameters.
    """

    def __init__(self, channel_size):
        super(GlobalLayerNorm, self).__init__()
        self.gamma = nn.Parameter(torch.Tensor(1, channel_size, 1))  # [1, N, 1] gain
        self.beta = nn.Parameter(torch.Tensor(1, channel_size, 1))   # [1, N, 1] bias
        self.reset_parameters()

    def reset_parameters(self):
        """Start as the identity transform: gain 1, bias 0."""
        self.gamma.data.fill_(1)
        self.beta.data.zero_()

    def forward(self, y):
        # Per-sample statistics over both channel (dim 1) and time (dim 2).
        mu = y.mean(dim=1, keepdim=True).mean(dim=2, keepdim=True)  # [M, 1, 1]
        var = (torch.pow(y - mu, 2)).mean(dim=1, keepdim=True).mean(dim=2, keepdim=True)
        # 1e-8 guards against division by zero for constant inputs.
        return self.gamma * (y - mu) / torch.pow(var + 1e-8, 0.5) + self.beta
99
+
100
class visualFrontend(nn.Module):

    """
    A visual feature extraction module. Generates a 512-dim feature vector per video frame.
    Architecture: A 3D convolution block followed by an 18-layer ResNet.
    """

    def __init__(self):
        super(visualFrontend, self).__init__()
        # 3-D front-end: temporal kernel 5 (stride 1), spatial stride 2
        # convolution plus spatial-only max-pool.
        self.frontend3D = nn.Sequential(
            nn.Conv3d(1, 64, kernel_size=(5,7,7), stride=(1,2,2), padding=(2,3,3), bias=False),
            nn.BatchNorm3d(64, momentum=0.01, eps=0.001),
            nn.ReLU(),
            nn.MaxPool3d(kernel_size=(1,3,3), stride=(1,2,2), padding=(0,1,1))
        )
        self.resnet = ResNet()
        return


    def forward(self, inputBatch):
        """Extract per-frame 512-dim features from a video batch."""
        # Move axes so Conv3d sees an (N, C, T, H, W) layout; the exact
        # input layout depends on the caller — confirm against
        # talkNetModel.forward_visual_frontend's (B*T, 1, 1, W, H) view.
        inputBatch = inputBatch.transpose(0, 1).transpose(1, 2)
        batchsize = inputBatch.shape[0]
        batch = self.frontend3D(inputBatch)

        # Fold the time axis into the batch axis so every frame goes
        # through the 2-D ResNet independently.
        batch = batch.transpose(1, 2)
        batch = batch.reshape(batch.shape[0]*batch.shape[1], batch.shape[2], batch.shape[3], batch.shape[4])
        outputBatch = self.resnet(batch)
        outputBatch = outputBatch.reshape(batchsize, -1, 512)
        # NOTE(review): the next two (1,2) transposes cancel each other;
        # the net effect is only the final (0,1) transpose.
        outputBatch = outputBatch.transpose(1 ,2)
        outputBatch = outputBatch.transpose(1, 2).transpose(0, 1)
        return outputBatch
131
+
132
class DSConv1d(nn.Module):
    """Depthwise-separable 1-D convolution block with a residual skip.

    ReLU + BatchNorm, a per-channel (grouped) 3-tap conv, PReLU, global
    layer norm, then a pointwise conv; the input is added back at the end.
    """

    def __init__(self):
        super(DSConv1d, self).__init__()
        self.net = nn.Sequential(
            nn.ReLU(),
            nn.BatchNorm1d(512),
            # Depthwise: one 3-tap filter per channel (groups=512).
            nn.Conv1d(512, 512, 3, stride=1, padding=1, dilation=1, groups=512, bias=False),
            nn.PReLU(),
            GlobalLayerNorm(512),
            # Pointwise: mix information across channels.
            nn.Conv1d(512, 512, 1, bias=False),
        )

    def forward(self, x):
        # Residual connection around the separable-conv block.
        return self.net(x) + x
147
+
148
class visualTCN(nn.Module):
    """Visual Temporal Network (V-TCN): five stacked DSConv1d blocks."""

    def __init__(self):
        super(visualTCN, self).__init__()
        # Each DSConv1d is residual, so the stack preserves the 512-channel shape.
        self.net = nn.Sequential(*[DSConv1d() for _ in range(5)])

    def forward(self, x):
        """Apply the temporal stack to a (B, 512, T) feature batch."""
        return self.net(x)
159
+
160
class visualConv1D(nn.Module):
    """Project 512-dim visual features down to 128 dims along time."""

    def __init__(self):
        super(visualConv1D, self).__init__()
        self.net = nn.Sequential(
            # Temporal context (kernel 5) with channel reduction 512 -> 256.
            nn.Conv1d(512, 256, 5, stride=1, padding=2),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            # Pointwise projection 256 -> 128.
            nn.Conv1d(256, 128, 1),
        )

    def forward(self, x):
        """Map a (B, 512, T) batch to (B, 128, T)."""
        return self.net(x)
talknet-asd/predict.py ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import cv2
3
+ import json
4
+ import glob
5
+ import pickle
6
+ import shutil
7
+ import subprocess
8
+ from typing import List, Optional
9
+ from cog import BasePredictor, BaseModel, Input, Path
10
+
11
+
12
class Output(BaseModel):
    """Prediction result container.

    Exactly one field is populated, depending on the `return_json`
    flag passed to `Predictor.predict`.
    """

    # Paths to the converted .mp4 visualisation files (return_json=False).
    media_path: Optional[List[Path]]
    # JSON-encoded per-frame face/speaking data (return_json=True).
    json_str: Optional[str]
15
+
16
+
17
class Predictor(BasePredictor):
    """Cog predictor wrapping the TalkNet active-speaker-detection demo.

    Runs `demoTalkNet.py` as a subprocess on the uploaded video, then
    either converts the rendered .avi outputs to .mp4 or reshapes the
    pickled tracks/scores into a per-frame JSON structure.
    """

    def setup(self):
        # No model loading here; demoTalkNet.py loads everything itself.
        pass

    def predict(
        self,
        video: Path = Input(description="Path to the video"),
        face_det_scale: float = Input(
            default=0.25,
            description="Scale factor for face detection, the frames will be scaled to 0.25 of the original",
            ge=0,
            le=1,
        ),
        min_track: int = Input(
            default=10, description="Number of min frames for each shot"
        ),
        num_failed_det: int = Input(
            default=10,
            description="Number of missed detections allowed before tracking is stopped",
            ge=1,
        ),
        min_face_size: int = Input(
            default=1, description="Minimum face size in pixels", ge=1
        ),
        crop_scale: float = Input(
            default=0.40, description="Scale bounding box", ge=0, le=1
        ),
        start: int = Input(default=0, description="The start time of the video", ge=0),
        duration: int = Input(
            default=-1,
            description="The duration of the video, when set as -1, will extract the whole video",
        ),
        return_json: bool = Input(
            description="Return results in json format", default=True
        ),
        return_boundingbox_percentages: bool = Input(
            description="Return bounding box coordinates as percentages of the video width and height",
            default=False,
        ),
    ) -> Output:

        video_path = str(video)
        video_name = os.path.splitext(os.path.basename(video_path))[0]
        video_folder = "demo"

        # Clean up and create the video folder
        shutil.rmtree(video_folder, ignore_errors=True)
        os.makedirs(video_folder, exist_ok=True)

        # Copy the input video to the video folder
        target_video_path = os.path.join(video_folder, os.path.basename(video_path))
        shutil.copy(video_path, target_video_path)

        # NOTE(review): this maps the documented -1 sentinel to 0; it relies
        # on demoTalkNet treating duration 0 as "whole video" — confirm.
        duration = max(0, duration)
        n_data_loader_thread = 32

        # Run the demoTalkNet.py script with the provided arguments
        command = (
            f"python demoTalkNet.py --videoName {video_name} "
            f"--videoFolder {video_folder} "
            f"--pretrainModel pretrain_TalkSet.model "
            f"--nDataLoaderThread {n_data_loader_thread} "
            f"--facedetScale {face_det_scale} "
            f"--minTrack {min_track} "
            f"--numFailedDet {num_failed_det} "
            f"--minFaceSize {min_face_size} "
            f"--cropScale {crop_scale} "
            f"--start {start} "
            f"--duration {duration} "
        )

        # NOTE(review): shell=True with an unquoted interpolated video name
        # breaks on spaces and is shell-injection-prone; prefer an argument
        # list with shell=False.
        process = subprocess.Popen(
            command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )
        stdout, stderr = process.communicate()
        print(f"Command output: {stdout.decode()}")
        if stderr:
            print(f"Command errors: {stderr.decode()}")

        # Find the most recent pywork folder
        pywork_folders = glob.glob(os.path.join(video_folder, "*", "pywork"))
        latest_pywork_folder = max(pywork_folders, key=os.path.getctime)

        # Load the face tracks and scores from the pickle files generated by demoTalkNet.py
        tracks_file = os.path.join(latest_pywork_folder, "tracks.pckl")
        scores_file = os.path.join(latest_pywork_folder, "scores.pckl")
        with open(tracks_file, "rb") as f:
            face_tracks = pickle.load(f)  # list
        with open(scores_file, "rb") as f:
            scores = pickle.load(f)  # list

        # Get the video dimensions
        # (note: rebinding `video` shadows the input parameter from here on)
        video = cv2.VideoCapture(target_video_path)
        video_width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
        video_height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))
        video.release()

        # Convert face tracks and scores to the desired JSON format
        output_data = []
        for track_idx, track in enumerate(face_tracks):
            # Get the frame numbers for the current track
            frames = track["track"]["frame"]

            # Get the bounding box information for the current track
            # (presumably per-frame center x/y and half-size s — confirm
            # against demoTalkNet's crop format)
            boxes = track["proc_track"]

            # Get the speaking scores for the current track
            # If the track index is out of range, use an empty list
            speaking_scores = scores[track_idx] if track_idx < len(scores) else []

            for i, frame in enumerate(frames):
                # Check if the current index is within the valid range of the bounding box information
                # If not, break the loop and move to the next track
                if i >= len(boxes["x"]) or i >= len(boxes["y"]) or i >= len(boxes["s"]):
                    break

                # Calculate bounding box coordinates
                x0 = int(boxes["x"][i] - boxes["s"][i])
                y0 = int(boxes["y"][i] - boxes["s"][i])
                x1 = int(boxes["x"][i] + boxes["s"][i])
                y1 = int(boxes["y"][i] + boxes["s"][i])

                # Normalize the bounding box coordinates if required
                if return_boundingbox_percentages:
                    x0 /= video_width
                    y0 /= video_height
                    x1 /= video_width
                    y1 /= video_height

                # Determine speaking status (non-negative score => speaking)
                speaking = (
                    bool(speaking_scores[i] >= 0) if i < len(speaking_scores) else False
                )

                # Create the bounding box dictionary
                box = {
                    "face_id": track_idx,
                    "x0": x0,
                    "y0": y0,
                    "x1": x1,
                    "y1": y1,
                    "speaking": speaking,
                }

                # Create a dictionary for each frame if it doesn't exist
                # (linear scan per frame: O(frames^2) overall, acceptable for
                # short clips)
                frame_data = next(
                    (
                        data
                        for data in output_data
                        if data["frame_number"] == int(frame)
                    ),
                    None,
                )
                if frame_data is None:
                    frame_data = {"frame_number": int(frame), "faces": []}
                    output_data.append(frame_data)

                # Add the current face's bounding box and speaking status to the frame's data
                frame_data["faces"].append(box)

        # Convert the output data to JSON string
        json_str = json.dumps(output_data)

        if return_json:
            return Output(json_str=json_str)
        else:
            # Convert every rendered .avi (except the intermediates) to .mp4.
            mp4_files = []
            excluded_files = ["video_only.avi", "video.avi"]
            avi_files = [
                avi_file
                for avi_file in Path(video_folder).rglob("*.avi")
                if avi_file.name not in excluded_files
            ]
            for avi_file in avi_files:
                mp4_file = avi_file.with_suffix(".mp4")
                conversion_command = f"ffmpeg -i {avi_file} {mp4_file}"
                conversion_process = subprocess.run(
                    conversion_command,
                    shell=True,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE,
                )
                # Only report files that were actually produced.
                if conversion_process.returncode == 0:
                    mp4_files.append(Path(mp4_file))
            return Output(media_path=mp4_files)
talknet-asd/sanity_check.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
talknet-asd/talkNet.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+
5
+ import sys, time, numpy, os, subprocess, pandas, tqdm
6
+
7
+ from loss import lossAV, lossA, lossV
8
+ from model.talkNetModel import talkNetModel
9
+
10
class talkNet(nn.Module):
    """Training/evaluation wrapper around `talkNetModel`.

    Bundles the model with its three losses (audio-visual, audio-only,
    visual-only), an Adam optimiser and a per-epoch LR decay schedule.
    All modules are placed on CUDA.
    """

    def __init__(self, lr = 0.0001, lrDecay = 0.95, **kwargs):
        # lr: initial Adam learning rate; lrDecay: multiplicative decay per epoch.
        super(talkNet, self).__init__()
        self.model = talkNetModel().cuda()
        self.lossAV = lossAV().cuda()
        self.lossA = lossA().cuda()
        self.lossV = lossV().cuda()
        self.optim = torch.optim.Adam(self.parameters(), lr = lr)
        self.scheduler = torch.optim.lr_scheduler.StepLR(self.optim, step_size = 1, gamma=lrDecay)
        # Parameter count reported in "millions" (1024*1024 divisor).
        print(time.strftime("%m-%d %H:%M:%S") + " Model para number = %.2f"%(sum(param.numel() for param in self.model.parameters()) / 1024 / 1024))

    def train_network(self, loader, epoch, **kwargs):
        """Run one training epoch; returns (mean loss, learning rate)."""
        self.train()
        # NOTE(review): passing an epoch to scheduler.step() is deprecated in
        # modern PyTorch — confirm the targeted torch version.
        self.scheduler.step(epoch - 1)
        index, top1, loss = 0, 0, 0
        lr = self.optim.param_groups[0]['lr']
        for num, (audioFeature, visualFeature, labels) in enumerate(loader, start=1):
            self.zero_grad()
            audioEmbed = self.model.forward_audio_frontend(audioFeature[0].cuda()) # feedForward
            visualEmbed = self.model.forward_visual_frontend(visualFeature[0].cuda())
            audioEmbed, visualEmbed = self.model.forward_cross_attention(audioEmbed, visualEmbed)
            outsAV= self.model.forward_audio_visual_backend(audioEmbed, visualEmbed)
            outsA = self.model.forward_audio_backend(audioEmbed)
            outsV = self.model.forward_visual_backend(visualEmbed)
            labels = labels[0].reshape((-1)).cuda() # Loss
            nlossAV, _, _, prec = self.lossAV.forward(outsAV, labels)
            nlossA = self.lossA.forward(outsA, labels)
            nlossV = self.lossV.forward(outsV, labels)
            # Weighted sum: AV loss dominates, single-modality losses regularise.
            nloss = nlossAV + 0.4 * nlossA + 0.4 * nlossV
            loss += nloss.detach().cpu().numpy()
            top1 += prec
            nloss.backward()
            self.optim.step()
            index += len(labels)
            # In-place progress line on stderr (carriage return, no newline).
            sys.stderr.write(time.strftime("%m-%d %H:%M:%S") + \
            " [%2d] Lr: %5f, Training: %.2f%%, "    %(epoch, lr, 100 * (num / loader.__len__())) + \
            " Loss: %.5f, ACC: %2.2f%% \r"        %(loss/(num), 100 * (top1/index)))
            sys.stderr.flush()
        sys.stdout.write("\n")
        return loss/num, lr

    def evaluate_network(self, loader, evalCsvSave, evalOrig, **kwargs):
        """Score the loader, write an AVA-format CSV, and return the mAP."""
        self.eval()
        predScores = []
        for audioFeature, visualFeature, labels in tqdm.tqdm(loader):
            with torch.no_grad():
                audioEmbed = self.model.forward_audio_frontend(audioFeature[0].cuda())
                visualEmbed = self.model.forward_visual_frontend(visualFeature[0].cuda())
                audioEmbed, visualEmbed = self.model.forward_cross_attention(audioEmbed, visualEmbed)
                outsAV= self.model.forward_audio_visual_backend(audioEmbed, visualEmbed)
                labels = labels[0].reshape((-1)).cuda()
                _, predScore, _, _ = self.lossAV.forward(outsAV, labels)
                # Column 1 is presumably the "speaking" class probability — confirm.
                predScore = predScore[:,1].detach().cpu().numpy()
                predScores.extend(predScore)
        evalLines = open(evalOrig).read().splitlines()[1:]
        labels = []
        labels = pandas.Series( ['SPEAKING_AUDIBLE' for line in evalLines])
        scores = pandas.Series(predScores)
        evalRes = pandas.read_csv(evalOrig)
        evalRes['score'] = scores
        evalRes['label'] = labels
        evalRes.drop(['label_id'], axis=1,inplace=True)
        evalRes.drop(['instance_id'], axis=1,inplace=True)
        evalRes.to_csv(evalCsvSave, index=False)
        # Parse the mAP out of the official AVA scorer's stdout.
        cmd = "python -O utils/get_ava_active_speaker_performance.py -g %s -p %s "%(evalOrig, evalCsvSave)
        mAP = float(str(subprocess.run(cmd, shell=True, capture_output =True).stdout).split(' ')[2][:5])
        return mAP

    def saveParameters(self, path):
        """Serialise the full wrapper state dict to `path`."""
        torch.save(self.state_dict(), path)

    def loadParameters(self, path):
        """Load a checkpoint, tolerating `module.` prefixes and skipping
        missing or shape-mismatched parameters (with a warning)."""
        selfState = self.state_dict()
        loadedState = torch.load(path)
        for name, param in loadedState.items():
            origName = name;
            if name not in selfState:
                # Strip DataParallel's "module." prefix and retry.
                name = name.replace("module.", "")
                if name not in selfState:
                    print("%s is not in the model."%origName)
                    continue
            if selfState[name].size() != loadedState[origName].size():
                sys.stderr.write("Wrong parameter length: %s, model: %s, loaded: %s"%(origName, selfState[name].size(), loadedState[origName].size()))
                continue
            selfState[name].copy_(param)