diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..6f392d5b73382ac8a6c71a1f29a6fffafc4ad2d8 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,10 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +facelandmarker/face_landmarker.task filter=lfs diff=lfs merge=lfs -text +insightface/models/arcface_r100_v1/model-0000.params filter=lfs diff=lfs merge=lfs -text +insightface/models/genderage_v1/model-0000.params filter=lfs diff=lfs merge=lfs -text +insightface/models/retinaface_r50_v1/R50-0000.params filter=lfs diff=lfs merge=lfs -text +talknet-asd/utils/overall.png filter=lfs diff=lfs merge=lfs -text +yolo-face-person-detector/images/image.png filter=lfs diff=lfs merge=lfs -text +yolo-face-person-detector/images/output.mp4 filter=lfs diff=lfs merge=lfs -text diff --git a/facelandmarker/face_landmarker.task b/facelandmarker/face_landmarker.task new file mode 100644 index 0000000000000000000000000000000000000000..fedb14de6d2b6708a56c04ae259783e23404c1aa --- /dev/null +++ b/facelandmarker/face_landmarker.task @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64184e229b263107bc2b804c6625db1341ff2bb731874b0bcc2fe6544e0bc9ff +size 3758596 diff --git a/insightface/.gitattributes b/insightface/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..1db2eb93a50d1d1e46461ccc83610212455996d4 --- /dev/null +++ b/insightface/.gitattributes @@ -0,0 +1,30 @@ +*.params filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bin.* filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 
filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zstandard filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/insightface/README.md b/insightface/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ed87ccbf4de7172bcf36ea456fefaf29d1502805 --- /dev/null +++ b/insightface/README.md @@ -0,0 +1,22 @@ +# insightface + +- https://github.com/deepinsight/insightface + - SCRFD + - https://github.com/deepinsight/insightface/tree/master/detection/scrfd + - https://1drv.ms/u/s!AswpsDO2toNKqyYWxScdiTITY4TQ?e=DjXof9 + - https://1drv.ms/u/s!AswpsDO2toNKqyPVLI44ahNBsOMR?e=esPrBL + - https://1drv.ms/u/s!AswpsDO2toNKqyTIXnzB1ujPq4th?e=5t1VNv + - https://1drv.ms/u/s!AswpsDO2toNKqyUKwTiwXv2kaa8o?e=umfepO + - https://1drv.ms/u/s!AswpsDO2toNKqyKZwFebVlmlOvzz?e=V2rqUy + - https://1drv.ms/u/s!AswpsDO2toNKri_NDM0GIkPpkE2f?e=JkebJo + - https://1drv.ms/u/s!AswpsDO2toNKqyGlhxnCg3smyQqX?e=A6Hufm + - https://1drv.ms/u/s!AswpsDO2toNKqyGlhxnCg3smyQqX?e=A6Hufm + - Person Detection + - https://github.com/deepinsight/insightface/tree/master/examples/person_detection + - 
https://github.com/deepinsight/insightface/releases/download/v0.7/scrfd_person_2.5g.onnx + - Face Alignment (FaceSynthetics) + - https://github.com/deepinsight/insightface/tree/master/alignment/synthetics + - https://drive.google.com/file/d/1kNP7qEl3AYNbaHFUg_ZiyRB1CtfDWXR4/view?usp=sharing + - buffalo_l + - https://github.com/deepinsight/insightface/releases/download/v0.7/buffalo_l.zip + diff --git a/insightface/models/arcface_r100_v1/model-0000.params b/insightface/models/arcface_r100_v1/model-0000.params new file mode 100644 index 0000000000000000000000000000000000000000..aa3434ba7f922ae0ca8fd98fff4c0ba01c4d043b --- /dev/null +++ b/insightface/models/arcface_r100_v1/model-0000.params @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:931257c0b7174254fd81314706f2591cc6d1dd7299275bb8cf01c774ed0da8be +size 260958682 diff --git a/insightface/models/arcface_r100_v1/model-symbol.json b/insightface/models/arcface_r100_v1/model-symbol.json new file mode 100644 index 0000000000000000000000000000000000000000..0b66f27a127ea92159fbe7b50fb8ce3625f844cf --- /dev/null +++ b/insightface/models/arcface_r100_v1/model-symbol.json @@ -0,0 +1,13635 @@ +{ + "nodes": [ + { + "op": "null", + "name": "data", + "inputs": [] + }, + { + "op": "_copy", + "name": "id", + "inputs": [[0, 0, 0]] + }, + { + "op": "_minus_scalar", + "name": "_minusscalar0", + "attrs": {"scalar": "127.5"}, + "inputs": [[1, 0, 0]] + }, + { + "op": "_mul_scalar", + "name": "_mulscalar0", + "attrs": {"scalar": "0.0078125"}, + "inputs": [[2, 0, 0]] + }, + { + "op": "null", + "name": "conv0_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "64", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "conv0", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "64", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[3, 0, 0], [4, 0, 0]] + }, + { + 
"op": "null", + "name": "bn0_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "bn0_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "bn0_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "bn0_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "bn0", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[5, 0, 0], [6, 0, 0], [7, 0, 0], [8, 0, 1], [9, 0, 1]] + }, + { + "op": "null", + "name": "relu0_gamma", + "attrs": { + "__init__": "[\"Constant\", {\"value\": 0.25}]", + "act_type": "prelu" + }, + "inputs": [] + }, + { + "op": "LeakyReLU", + "name": "relu0", + "attrs": {"act_type": "prelu"}, + "inputs": [[10, 0, 0], [11, 0, 0]] + }, + { + "op": "null", + "name": "stage1_unit1_bn1_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage1_unit1_bn1_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage1_unit1_bn1_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage1_unit1_bn1_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage1_unit1_bn1", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[12, 0, 0], [13, 0, 0], [14, 0, 0], [15, 0, 
1], [16, 0, 1]] + }, + { + "op": "null", + "name": "stage1_unit1_conv1_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "64", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage1_unit1_conv1", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "64", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[17, 0, 0], [18, 0, 0]] + }, + { + "op": "null", + "name": "stage1_unit1_bn2_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage1_unit1_bn2_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage1_unit1_bn2_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage1_unit1_bn2_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage1_unit1_bn2", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[19, 0, 0], [20, 0, 0], [21, 0, 0], [22, 0, 1], [23, 0, 1]] + }, + { + "op": "null", + "name": "stage1_unit1_relu1_gamma", + "attrs": { + "__init__": "[\"Constant\", {\"value\": 0.25}]", + "act_type": "prelu" + }, + "inputs": [] + }, + { + "op": "LeakyReLU", + "name": "stage1_unit1_relu1", + "attrs": {"act_type": "prelu"}, + "inputs": [[24, 0, 0], [25, 0, 0]] + }, + { + "op": "null", + "name": "stage1_unit1_conv2_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "64", + "pad": "(1, 1)", + "stride": "(2, 2)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage1_unit1_conv2", + "attrs": { 
+ "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "64", + "pad": "(1, 1)", + "stride": "(2, 2)", + "workspace": "256" + }, + "inputs": [[26, 0, 0], [27, 0, 0]] + }, + { + "op": "null", + "name": "stage1_unit1_bn3_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage1_unit1_bn3_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage1_unit1_bn3_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage1_unit1_bn3_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage1_unit1_bn3", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[28, 0, 0], [29, 0, 0], [30, 0, 0], [31, 0, 1], [32, 0, 1]] + }, + { + "op": "null", + "name": "stage1_unit1_conv1sc_weight", + "attrs": { + "kernel": "(1, 1)", + "no_bias": "True", + "num_filter": "64", + "stride": "(2, 2)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage1_unit1_conv1sc", + "attrs": { + "kernel": "(1, 1)", + "no_bias": "True", + "num_filter": "64", + "stride": "(2, 2)", + "workspace": "256" + }, + "inputs": [[12, 0, 0], [34, 0, 0]] + }, + { + "op": "null", + "name": "stage1_unit1_sc_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage1_unit1_sc_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage1_unit1_sc_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + 
}, + "inputs": [] + }, + { + "op": "null", + "name": "stage1_unit1_sc_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage1_unit1_sc", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[35, 0, 0], [36, 0, 0], [37, 0, 0], [38, 0, 1], [39, 0, 1]] + }, + { + "op": "elemwise_add", + "name": "_plus0", + "inputs": [[33, 0, 0], [40, 0, 0]] + }, + { + "op": "null", + "name": "stage1_unit2_bn1_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage1_unit2_bn1_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage1_unit2_bn1_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage1_unit2_bn1_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage1_unit2_bn1", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[41, 0, 0], [42, 0, 0], [43, 0, 0], [44, 0, 1], [45, 0, 1]] + }, + { + "op": "null", + "name": "stage1_unit2_conv1_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "64", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage1_unit2_conv1", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "64", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[46, 0, 0], [47, 0, 0]] + }, + { + "op": "null", + "name": "stage1_unit2_bn2_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": 
"False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage1_unit2_bn2_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage1_unit2_bn2_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage1_unit2_bn2_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage1_unit2_bn2", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[48, 0, 0], [49, 0, 0], [50, 0, 0], [51, 0, 1], [52, 0, 1]] + }, + { + "op": "null", + "name": "stage1_unit2_relu1_gamma", + "attrs": { + "__init__": "[\"Constant\", {\"value\": 0.25}]", + "act_type": "prelu" + }, + "inputs": [] + }, + { + "op": "LeakyReLU", + "name": "stage1_unit2_relu1", + "attrs": {"act_type": "prelu"}, + "inputs": [[53, 0, 0], [54, 0, 0]] + }, + { + "op": "null", + "name": "stage1_unit2_conv2_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "64", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage1_unit2_conv2", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "64", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[55, 0, 0], [56, 0, 0]] + }, + { + "op": "null", + "name": "stage1_unit2_bn3_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage1_unit2_bn3_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage1_unit2_bn3_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + 
"eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage1_unit2_bn3_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage1_unit2_bn3", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[57, 0, 0], [58, 0, 0], [59, 0, 0], [60, 0, 1], [61, 0, 1]] + }, + { + "op": "elemwise_add", + "name": "_plus1", + "inputs": [[62, 0, 0], [41, 0, 0]] + }, + { + "op": "null", + "name": "stage1_unit3_bn1_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage1_unit3_bn1_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage1_unit3_bn1_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage1_unit3_bn1_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage1_unit3_bn1", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[63, 0, 0], [64, 0, 0], [65, 0, 0], [66, 0, 1], [67, 0, 1]] + }, + { + "op": "null", + "name": "stage1_unit3_conv1_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "64", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage1_unit3_conv1", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "64", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[68, 0, 0], [69, 0, 0]] + }, + { + "op": "null", + "name": 
"stage1_unit3_bn2_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage1_unit3_bn2_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage1_unit3_bn2_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage1_unit3_bn2_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage1_unit3_bn2", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[70, 0, 0], [71, 0, 0], [72, 0, 0], [73, 0, 1], [74, 0, 1]] + }, + { + "op": "null", + "name": "stage1_unit3_relu1_gamma", + "attrs": { + "__init__": "[\"Constant\", {\"value\": 0.25}]", + "act_type": "prelu" + }, + "inputs": [] + }, + { + "op": "LeakyReLU", + "name": "stage1_unit3_relu1", + "attrs": {"act_type": "prelu"}, + "inputs": [[75, 0, 0], [76, 0, 0]] + }, + { + "op": "null", + "name": "stage1_unit3_conv2_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "64", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage1_unit3_conv2", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "64", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[77, 0, 0], [78, 0, 0]] + }, + { + "op": "null", + "name": "stage1_unit3_bn3_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage1_unit3_bn3_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": 
"stage1_unit3_bn3_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage1_unit3_bn3_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage1_unit3_bn3", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[79, 0, 0], [80, 0, 0], [81, 0, 0], [82, 0, 1], [83, 0, 1]] + }, + { + "op": "elemwise_add", + "name": "_plus2", + "inputs": [[84, 0, 0], [63, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit1_bn1_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit1_bn1_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit1_bn1_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit1_bn1_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage2_unit1_bn1", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[85, 0, 0], [86, 0, 0], [87, 0, 0], [88, 0, 1], [89, 0, 1]] + }, + { + "op": "null", + "name": "stage2_unit1_conv1_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "128", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage2_unit1_conv1", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "128", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, 
+ "inputs": [[90, 0, 0], [91, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit1_bn2_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit1_bn2_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit1_bn2_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit1_bn2_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage2_unit1_bn2", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[92, 0, 0], [93, 0, 0], [94, 0, 0], [95, 0, 1], [96, 0, 1]] + }, + { + "op": "null", + "name": "stage2_unit1_relu1_gamma", + "attrs": { + "__init__": "[\"Constant\", {\"value\": 0.25}]", + "act_type": "prelu" + }, + "inputs": [] + }, + { + "op": "LeakyReLU", + "name": "stage2_unit1_relu1", + "attrs": {"act_type": "prelu"}, + "inputs": [[97, 0, 0], [98, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit1_conv2_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "128", + "pad": "(1, 1)", + "stride": "(2, 2)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage2_unit1_conv2", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "128", + "pad": "(1, 1)", + "stride": "(2, 2)", + "workspace": "256" + }, + "inputs": [[99, 0, 0], [100, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit1_bn3_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit1_bn3_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + 
"momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit1_bn3_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit1_bn3_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage2_unit1_bn3", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[101, 0, 0], [102, 0, 0], [103, 0, 0], [104, 0, 1], [105, 0, 1]] + }, + { + "op": "null", + "name": "stage2_unit1_conv1sc_weight", + "attrs": { + "kernel": "(1, 1)", + "no_bias": "True", + "num_filter": "128", + "stride": "(2, 2)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage2_unit1_conv1sc", + "attrs": { + "kernel": "(1, 1)", + "no_bias": "True", + "num_filter": "128", + "stride": "(2, 2)", + "workspace": "256" + }, + "inputs": [[85, 0, 0], [107, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit1_sc_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit1_sc_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit1_sc_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit1_sc_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage2_unit1_sc", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[108, 0, 0], [109, 0, 0], [110, 0, 0], [111, 0, 1], [112, 0, 1]] + }, + { 
+ "op": "elemwise_add", + "name": "_plus3", + "inputs": [[106, 0, 0], [113, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit2_bn1_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit2_bn1_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit2_bn1_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit2_bn1_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage2_unit2_bn1", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[114, 0, 0], [115, 0, 0], [116, 0, 0], [117, 0, 1], [118, 0, 1]] + }, + { + "op": "null", + "name": "stage2_unit2_conv1_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "128", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage2_unit2_conv1", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "128", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[119, 0, 0], [120, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit2_bn2_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit2_bn2_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit2_bn2_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": 
"null", + "name": "stage2_unit2_bn2_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage2_unit2_bn2", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[121, 0, 0], [122, 0, 0], [123, 0, 0], [124, 0, 1], [125, 0, 1]] + }, + { + "op": "null", + "name": "stage2_unit2_relu1_gamma", + "attrs": { + "__init__": "[\"Constant\", {\"value\": 0.25}]", + "act_type": "prelu" + }, + "inputs": [] + }, + { + "op": "LeakyReLU", + "name": "stage2_unit2_relu1", + "attrs": {"act_type": "prelu"}, + "inputs": [[126, 0, 0], [127, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit2_conv2_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "128", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage2_unit2_conv2", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "128", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[128, 0, 0], [129, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit2_bn3_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit2_bn3_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit2_bn3_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit2_bn3_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage2_unit2_bn3", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": 
"0.9" + }, + "inputs": [[130, 0, 0], [131, 0, 0], [132, 0, 0], [133, 0, 1], [134, 0, 1]] + }, + { + "op": "elemwise_add", + "name": "_plus4", + "inputs": [[135, 0, 0], [114, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit3_bn1_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit3_bn1_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit3_bn1_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit3_bn1_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage2_unit3_bn1", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[136, 0, 0], [137, 0, 0], [138, 0, 0], [139, 0, 1], [140, 0, 1]] + }, + { + "op": "null", + "name": "stage2_unit3_conv1_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "128", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage2_unit3_conv1", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "128", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[141, 0, 0], [142, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit3_bn2_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit3_bn2_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit3_bn2_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + 
"eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit3_bn2_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage2_unit3_bn2", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[143, 0, 0], [144, 0, 0], [145, 0, 0], [146, 0, 1], [147, 0, 1]] + }, + { + "op": "null", + "name": "stage2_unit3_relu1_gamma", + "attrs": { + "__init__": "[\"Constant\", {\"value\": 0.25}]", + "act_type": "prelu" + }, + "inputs": [] + }, + { + "op": "LeakyReLU", + "name": "stage2_unit3_relu1", + "attrs": {"act_type": "prelu"}, + "inputs": [[148, 0, 0], [149, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit3_conv2_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "128", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage2_unit3_conv2", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "128", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[150, 0, 0], [151, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit3_bn3_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit3_bn3_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit3_bn3_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit3_bn3_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + 
"name": "stage2_unit3_bn3", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[152, 0, 0], [153, 0, 0], [154, 0, 0], [155, 0, 1], [156, 0, 1]] + }, + { + "op": "elemwise_add", + "name": "_plus5", + "inputs": [[157, 0, 0], [136, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit4_bn1_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit4_bn1_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit4_bn1_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit4_bn1_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage2_unit4_bn1", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[158, 0, 0], [159, 0, 0], [160, 0, 0], [161, 0, 1], [162, 0, 1]] + }, + { + "op": "null", + "name": "stage2_unit4_conv1_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "128", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage2_unit4_conv1", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "128", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[163, 0, 0], [164, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit4_bn2_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit4_bn2_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": 
"null", + "name": "stage2_unit4_bn2_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit4_bn2_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage2_unit4_bn2", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[165, 0, 0], [166, 0, 0], [167, 0, 0], [168, 0, 1], [169, 0, 1]] + }, + { + "op": "null", + "name": "stage2_unit4_relu1_gamma", + "attrs": { + "__init__": "[\"Constant\", {\"value\": 0.25}]", + "act_type": "prelu" + }, + "inputs": [] + }, + { + "op": "LeakyReLU", + "name": "stage2_unit4_relu1", + "attrs": {"act_type": "prelu"}, + "inputs": [[170, 0, 0], [171, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit4_conv2_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "128", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage2_unit4_conv2", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "128", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[172, 0, 0], [173, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit4_bn3_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit4_bn3_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit4_bn3_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit4_bn3_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + 
"fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage2_unit4_bn3", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[174, 0, 0], [175, 0, 0], [176, 0, 0], [177, 0, 1], [178, 0, 1]] + }, + { + "op": "elemwise_add", + "name": "_plus6", + "inputs": [[179, 0, 0], [158, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit5_bn1_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit5_bn1_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit5_bn1_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit5_bn1_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage2_unit5_bn1", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[180, 0, 0], [181, 0, 0], [182, 0, 0], [183, 0, 1], [184, 0, 1]] + }, + { + "op": "null", + "name": "stage2_unit5_conv1_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "128", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage2_unit5_conv1", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "128", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[185, 0, 0], [186, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit5_bn2_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit5_bn2_beta", + "attrs": { + "eps": 
"2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit5_bn2_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit5_bn2_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage2_unit5_bn2", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[187, 0, 0], [188, 0, 0], [189, 0, 0], [190, 0, 1], [191, 0, 1]] + }, + { + "op": "null", + "name": "stage2_unit5_relu1_gamma", + "attrs": { + "__init__": "[\"Constant\", {\"value\": 0.25}]", + "act_type": "prelu" + }, + "inputs": [] + }, + { + "op": "LeakyReLU", + "name": "stage2_unit5_relu1", + "attrs": {"act_type": "prelu"}, + "inputs": [[192, 0, 0], [193, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit5_conv2_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "128", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage2_unit5_conv2", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "128", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[194, 0, 0], [195, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit5_bn3_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit5_bn3_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit5_bn3_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": 
"stage2_unit5_bn3_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage2_unit5_bn3", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[196, 0, 0], [197, 0, 0], [198, 0, 0], [199, 0, 1], [200, 0, 1]] + }, + { + "op": "elemwise_add", + "name": "_plus7", + "inputs": [[201, 0, 0], [180, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit6_bn1_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit6_bn1_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit6_bn1_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit6_bn1_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage2_unit6_bn1", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[202, 0, 0], [203, 0, 0], [204, 0, 0], [205, 0, 1], [206, 0, 1]] + }, + { + "op": "null", + "name": "stage2_unit6_conv1_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "128", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage2_unit6_conv1", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "128", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[207, 0, 0], [208, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit6_bn2_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + 
"inputs": [] + }, + { + "op": "null", + "name": "stage2_unit6_bn2_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit6_bn2_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit6_bn2_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage2_unit6_bn2", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[209, 0, 0], [210, 0, 0], [211, 0, 0], [212, 0, 1], [213, 0, 1]] + }, + { + "op": "null", + "name": "stage2_unit6_relu1_gamma", + "attrs": { + "__init__": "[\"Constant\", {\"value\": 0.25}]", + "act_type": "prelu" + }, + "inputs": [] + }, + { + "op": "LeakyReLU", + "name": "stage2_unit6_relu1", + "attrs": {"act_type": "prelu"}, + "inputs": [[214, 0, 0], [215, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit6_conv2_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "128", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage2_unit6_conv2", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "128", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[216, 0, 0], [217, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit6_bn3_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit6_bn3_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit6_bn3_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + 
"fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit6_bn3_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage2_unit6_bn3", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[218, 0, 0], [219, 0, 0], [220, 0, 0], [221, 0, 1], [222, 0, 1]] + }, + { + "op": "elemwise_add", + "name": "_plus8", + "inputs": [[223, 0, 0], [202, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit7_bn1_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit7_bn1_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit7_bn1_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit7_bn1_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage2_unit7_bn1", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[224, 0, 0], [225, 0, 0], [226, 0, 0], [227, 0, 1], [228, 0, 1]] + }, + { + "op": "null", + "name": "stage2_unit7_conv1_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "128", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage2_unit7_conv1", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "128", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[229, 0, 0], [230, 0, 0]] + }, + { + "op": "null", + "name": 
"stage2_unit7_bn2_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit7_bn2_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit7_bn2_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit7_bn2_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage2_unit7_bn2", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[231, 0, 0], [232, 0, 0], [233, 0, 0], [234, 0, 1], [235, 0, 1]] + }, + { + "op": "null", + "name": "stage2_unit7_relu1_gamma", + "attrs": { + "__init__": "[\"Constant\", {\"value\": 0.25}]", + "act_type": "prelu" + }, + "inputs": [] + }, + { + "op": "LeakyReLU", + "name": "stage2_unit7_relu1", + "attrs": {"act_type": "prelu"}, + "inputs": [[236, 0, 0], [237, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit7_conv2_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "128", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage2_unit7_conv2", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "128", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[238, 0, 0], [239, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit7_bn3_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit7_bn3_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + 
"name": "stage2_unit7_bn3_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit7_bn3_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage2_unit7_bn3", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[240, 0, 0], [241, 0, 0], [242, 0, 0], [243, 0, 1], [244, 0, 1]] + }, + { + "op": "elemwise_add", + "name": "_plus9", + "inputs": [[245, 0, 0], [224, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit8_bn1_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit8_bn1_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit8_bn1_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit8_bn1_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage2_unit8_bn1", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[246, 0, 0], [247, 0, 0], [248, 0, 0], [249, 0, 1], [250, 0, 1]] + }, + { + "op": "null", + "name": "stage2_unit8_conv1_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "128", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage2_unit8_conv1", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "128", + "pad": "(1, 1)", + "stride": "(1, 1)", + 
"workspace": "256" + }, + "inputs": [[251, 0, 0], [252, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit8_bn2_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit8_bn2_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit8_bn2_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit8_bn2_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage2_unit8_bn2", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[253, 0, 0], [254, 0, 0], [255, 0, 0], [256, 0, 1], [257, 0, 1]] + }, + { + "op": "null", + "name": "stage2_unit8_relu1_gamma", + "attrs": { + "__init__": "[\"Constant\", {\"value\": 0.25}]", + "act_type": "prelu" + }, + "inputs": [] + }, + { + "op": "LeakyReLU", + "name": "stage2_unit8_relu1", + "attrs": {"act_type": "prelu"}, + "inputs": [[258, 0, 0], [259, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit8_conv2_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "128", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage2_unit8_conv2", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "128", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[260, 0, 0], [261, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit8_bn3_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit8_bn3_beta", + "attrs": { + "eps": 
"2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit8_bn3_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit8_bn3_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage2_unit8_bn3", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[262, 0, 0], [263, 0, 0], [264, 0, 0], [265, 0, 1], [266, 0, 1]] + }, + { + "op": "elemwise_add", + "name": "_plus10", + "inputs": [[267, 0, 0], [246, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit9_bn1_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit9_bn1_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit9_bn1_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit9_bn1_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage2_unit9_bn1", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[268, 0, 0], [269, 0, 0], [270, 0, 0], [271, 0, 1], [272, 0, 1]] + }, + { + "op": "null", + "name": "stage2_unit9_conv1_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "128", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage2_unit9_conv1", + "attrs": { + "kernel": 
"(3, 3)", + "no_bias": "True", + "num_filter": "128", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[273, 0, 0], [274, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit9_bn2_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit9_bn2_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit9_bn2_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit9_bn2_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage2_unit9_bn2", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[275, 0, 0], [276, 0, 0], [277, 0, 0], [278, 0, 1], [279, 0, 1]] + }, + { + "op": "null", + "name": "stage2_unit9_relu1_gamma", + "attrs": { + "__init__": "[\"Constant\", {\"value\": 0.25}]", + "act_type": "prelu" + }, + "inputs": [] + }, + { + "op": "LeakyReLU", + "name": "stage2_unit9_relu1", + "attrs": {"act_type": "prelu"}, + "inputs": [[280, 0, 0], [281, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit9_conv2_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "128", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage2_unit9_conv2", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "128", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[282, 0, 0], [283, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit9_bn3_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + 
"inputs": [] + }, + { + "op": "null", + "name": "stage2_unit9_bn3_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit9_bn3_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit9_bn3_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage2_unit9_bn3", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[284, 0, 0], [285, 0, 0], [286, 0, 0], [287, 0, 1], [288, 0, 1]] + }, + { + "op": "elemwise_add", + "name": "_plus11", + "inputs": [[289, 0, 0], [268, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit10_bn1_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit10_bn1_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit10_bn1_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit10_bn1_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage2_unit10_bn1", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[290, 0, 0], [291, 0, 0], [292, 0, 0], [293, 0, 1], [294, 0, 1]] + }, + { + "op": "null", + "name": "stage2_unit10_conv1_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "128", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + 
"inputs": [] + }, + { + "op": "Convolution", + "name": "stage2_unit10_conv1", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "128", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[295, 0, 0], [296, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit10_bn2_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit10_bn2_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit10_bn2_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit10_bn2_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage2_unit10_bn2", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[297, 0, 0], [298, 0, 0], [299, 0, 0], [300, 0, 1], [301, 0, 1]] + }, + { + "op": "null", + "name": "stage2_unit10_relu1_gamma", + "attrs": { + "__init__": "[\"Constant\", {\"value\": 0.25}]", + "act_type": "prelu" + }, + "inputs": [] + }, + { + "op": "LeakyReLU", + "name": "stage2_unit10_relu1", + "attrs": {"act_type": "prelu"}, + "inputs": [[302, 0, 0], [303, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit10_conv2_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "128", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage2_unit10_conv2", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "128", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[304, 0, 0], [305, 0, 0]] + }, + { + "op": "null", + "name": 
"stage2_unit10_bn3_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit10_bn3_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit10_bn3_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit10_bn3_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage2_unit10_bn3", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[306, 0, 0], [307, 0, 0], [308, 0, 0], [309, 0, 1], [310, 0, 1]] + }, + { + "op": "elemwise_add", + "name": "_plus12", + "inputs": [[311, 0, 0], [290, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit11_bn1_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit11_bn1_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit11_bn1_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit11_bn1_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage2_unit11_bn1", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[312, 0, 0], [313, 0, 0], [314, 0, 0], [315, 0, 1], [316, 0, 1]] + }, + { + "op": "null", + "name": "stage2_unit11_conv1_weight", + "attrs": { + "kernel": "(3, 3)", + 
"no_bias": "True", + "num_filter": "128", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage2_unit11_conv1", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "128", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[317, 0, 0], [318, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit11_bn2_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit11_bn2_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit11_bn2_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit11_bn2_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage2_unit11_bn2", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[319, 0, 0], [320, 0, 0], [321, 0, 0], [322, 0, 1], [323, 0, 1]] + }, + { + "op": "null", + "name": "stage2_unit11_relu1_gamma", + "attrs": { + "__init__": "[\"Constant\", {\"value\": 0.25}]", + "act_type": "prelu" + }, + "inputs": [] + }, + { + "op": "LeakyReLU", + "name": "stage2_unit11_relu1", + "attrs": {"act_type": "prelu"}, + "inputs": [[324, 0, 0], [325, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit11_conv2_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "128", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage2_unit11_conv2", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "128", + "pad": "(1, 1)", + 
"stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[326, 0, 0], [327, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit11_bn3_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit11_bn3_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit11_bn3_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit11_bn3_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage2_unit11_bn3", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[328, 0, 0], [329, 0, 0], [330, 0, 0], [331, 0, 1], [332, 0, 1]] + }, + { + "op": "elemwise_add", + "name": "_plus13", + "inputs": [[333, 0, 0], [312, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit12_bn1_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit12_bn1_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit12_bn1_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit12_bn1_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage2_unit12_bn1", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[334, 0, 0], [335, 0, 0], [336, 0, 0], [337, 
0, 1], [338, 0, 1]] + }, + { + "op": "null", + "name": "stage2_unit12_conv1_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "128", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage2_unit12_conv1", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "128", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[339, 0, 0], [340, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit12_bn2_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit12_bn2_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit12_bn2_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit12_bn2_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage2_unit12_bn2", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[341, 0, 0], [342, 0, 0], [343, 0, 0], [344, 0, 1], [345, 0, 1]] + }, + { + "op": "null", + "name": "stage2_unit12_relu1_gamma", + "attrs": { + "__init__": "[\"Constant\", {\"value\": 0.25}]", + "act_type": "prelu" + }, + "inputs": [] + }, + { + "op": "LeakyReLU", + "name": "stage2_unit12_relu1", + "attrs": {"act_type": "prelu"}, + "inputs": [[346, 0, 0], [347, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit12_conv2_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "128", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": 
"stage2_unit12_conv2", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "128", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[348, 0, 0], [349, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit12_bn3_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit12_bn3_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit12_bn3_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit12_bn3_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage2_unit12_bn3", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[350, 0, 0], [351, 0, 0], [352, 0, 0], [353, 0, 1], [354, 0, 1]] + }, + { + "op": "elemwise_add", + "name": "_plus14", + "inputs": [[355, 0, 0], [334, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit13_bn1_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit13_bn1_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit13_bn1_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit13_bn1_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage2_unit13_bn1", + "attrs": { + 
"eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[356, 0, 0], [357, 0, 0], [358, 0, 0], [359, 0, 1], [360, 0, 1]] + }, + { + "op": "null", + "name": "stage2_unit13_conv1_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "128", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage2_unit13_conv1", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "128", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[361, 0, 0], [362, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit13_bn2_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit13_bn2_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit13_bn2_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit13_bn2_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage2_unit13_bn2", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[363, 0, 0], [364, 0, 0], [365, 0, 0], [366, 0, 1], [367, 0, 1]] + }, + { + "op": "null", + "name": "stage2_unit13_relu1_gamma", + "attrs": { + "__init__": "[\"Constant\", {\"value\": 0.25}]", + "act_type": "prelu" + }, + "inputs": [] + }, + { + "op": "LeakyReLU", + "name": "stage2_unit13_relu1", + "attrs": {"act_type": "prelu"}, + "inputs": [[368, 0, 0], [369, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit13_conv2_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "128", + "pad": 
"(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage2_unit13_conv2", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "128", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[370, 0, 0], [371, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit13_bn3_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit13_bn3_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit13_bn3_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit13_bn3_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage2_unit13_bn3", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[372, 0, 0], [373, 0, 0], [374, 0, 0], [375, 0, 1], [376, 0, 1]] + }, + { + "op": "elemwise_add", + "name": "_plus15", + "inputs": [[377, 0, 0], [356, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit1_bn1_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit1_bn1_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit1_bn1_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit1_bn1_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + 
"momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit1_bn1", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[378, 0, 0], [379, 0, 0], [380, 0, 0], [381, 0, 1], [382, 0, 1]] + }, + { + "op": "null", + "name": "stage3_unit1_conv1_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit1_conv1", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[383, 0, 0], [384, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit1_bn2_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit1_bn2_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit1_bn2_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit1_bn2_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit1_bn2", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[385, 0, 0], [386, 0, 0], [387, 0, 0], [388, 0, 1], [389, 0, 1]] + }, + { + "op": "null", + "name": "stage3_unit1_relu1_gamma", + "attrs": { + "__init__": "[\"Constant\", {\"value\": 0.25}]", + "act_type": "prelu" + }, + "inputs": [] + }, + { + "op": "LeakyReLU", + "name": "stage3_unit1_relu1", + "attrs": {"act_type": "prelu"}, + "inputs": [[390, 0, 0], [391, 0, 0]] + }, + { + "op": "null", + "name": 
"stage3_unit1_conv2_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(2, 2)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit1_conv2", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(2, 2)", + "workspace": "256" + }, + "inputs": [[392, 0, 0], [393, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit1_bn3_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit1_bn3_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit1_bn3_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit1_bn3_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit1_bn3", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[394, 0, 0], [395, 0, 0], [396, 0, 0], [397, 0, 1], [398, 0, 1]] + }, + { + "op": "null", + "name": "stage3_unit1_conv1sc_weight", + "attrs": { + "kernel": "(1, 1)", + "no_bias": "True", + "num_filter": "256", + "stride": "(2, 2)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit1_conv1sc", + "attrs": { + "kernel": "(1, 1)", + "no_bias": "True", + "num_filter": "256", + "stride": "(2, 2)", + "workspace": "256" + }, + "inputs": [[378, 0, 0], [400, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit1_sc_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": 
"stage3_unit1_sc_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit1_sc_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit1_sc_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit1_sc", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[401, 0, 0], [402, 0, 0], [403, 0, 0], [404, 0, 1], [405, 0, 1]] + }, + { + "op": "elemwise_add", + "name": "_plus16", + "inputs": [[399, 0, 0], [406, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit2_bn1_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit2_bn1_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit2_bn1_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit2_bn1_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit2_bn1", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[407, 0, 0], [408, 0, 0], [409, 0, 0], [410, 0, 1], [411, 0, 1]] + }, + { + "op": "null", + "name": "stage3_unit2_conv1_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": 
"stage3_unit2_conv1", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[412, 0, 0], [413, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit2_bn2_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit2_bn2_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit2_bn2_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit2_bn2_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit2_bn2", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[414, 0, 0], [415, 0, 0], [416, 0, 0], [417, 0, 1], [418, 0, 1]] + }, + { + "op": "null", + "name": "stage3_unit2_relu1_gamma", + "attrs": { + "__init__": "[\"Constant\", {\"value\": 0.25}]", + "act_type": "prelu" + }, + "inputs": [] + }, + { + "op": "LeakyReLU", + "name": "stage3_unit2_relu1", + "attrs": {"act_type": "prelu"}, + "inputs": [[419, 0, 0], [420, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit2_conv2_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit2_conv2", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[421, 0, 0], [422, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit2_bn3_gamma", + "attrs": { + "eps": "2e-05", + 
"fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit2_bn3_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit2_bn3_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit2_bn3_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit2_bn3", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[423, 0, 0], [424, 0, 0], [425, 0, 0], [426, 0, 1], [427, 0, 1]] + }, + { + "op": "elemwise_add", + "name": "_plus17", + "inputs": [[428, 0, 0], [407, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit3_bn1_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit3_bn1_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit3_bn1_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit3_bn1_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit3_bn1", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[429, 0, 0], [430, 0, 0], [431, 0, 0], [432, 0, 1], [433, 0, 1]] + }, + { + "op": "null", + "name": "stage3_unit3_conv1_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + 
"stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit3_conv1", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[434, 0, 0], [435, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit3_bn2_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit3_bn2_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit3_bn2_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit3_bn2_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit3_bn2", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[436, 0, 0], [437, 0, 0], [438, 0, 0], [439, 0, 1], [440, 0, 1]] + }, + { + "op": "null", + "name": "stage3_unit3_relu1_gamma", + "attrs": { + "__init__": "[\"Constant\", {\"value\": 0.25}]", + "act_type": "prelu" + }, + "inputs": [] + }, + { + "op": "LeakyReLU", + "name": "stage3_unit3_relu1", + "attrs": {"act_type": "prelu"}, + "inputs": [[441, 0, 0], [442, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit3_conv2_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit3_conv2", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[443, 0, 0], [444, 0, 
0]] + }, + { + "op": "null", + "name": "stage3_unit3_bn3_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit3_bn3_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit3_bn3_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit3_bn3_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit3_bn3", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[445, 0, 0], [446, 0, 0], [447, 0, 0], [448, 0, 1], [449, 0, 1]] + }, + { + "op": "elemwise_add", + "name": "_plus18", + "inputs": [[450, 0, 0], [429, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit4_bn1_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit4_bn1_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit4_bn1_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit4_bn1_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit4_bn1", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[451, 0, 0], [452, 0, 0], [453, 0, 0], [454, 0, 1], [455, 0, 1]] + }, + { + "op": "null", + "name": "stage3_unit4_conv1_weight", + 
"attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit4_conv1", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[456, 0, 0], [457, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit4_bn2_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit4_bn2_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit4_bn2_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit4_bn2_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit4_bn2", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[458, 0, 0], [459, 0, 0], [460, 0, 0], [461, 0, 1], [462, 0, 1]] + }, + { + "op": "null", + "name": "stage3_unit4_relu1_gamma", + "attrs": { + "__init__": "[\"Constant\", {\"value\": 0.25}]", + "act_type": "prelu" + }, + "inputs": [] + }, + { + "op": "LeakyReLU", + "name": "stage3_unit4_relu1", + "attrs": {"act_type": "prelu"}, + "inputs": [[463, 0, 0], [464, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit4_conv2_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit4_conv2", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + 
"pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[465, 0, 0], [466, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit4_bn3_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit4_bn3_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit4_bn3_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit4_bn3_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit4_bn3", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[467, 0, 0], [468, 0, 0], [469, 0, 0], [470, 0, 1], [471, 0, 1]] + }, + { + "op": "elemwise_add", + "name": "_plus19", + "inputs": [[472, 0, 0], [451, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit5_bn1_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit5_bn1_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit5_bn1_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit5_bn1_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit5_bn1", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[473, 0, 0], [474, 0, 0], [475, 0, 
0], [476, 0, 1], [477, 0, 1]] + }, + { + "op": "null", + "name": "stage3_unit5_conv1_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit5_conv1", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[478, 0, 0], [479, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit5_bn2_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit5_bn2_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit5_bn2_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit5_bn2_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit5_bn2", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[480, 0, 0], [481, 0, 0], [482, 0, 0], [483, 0, 1], [484, 0, 1]] + }, + { + "op": "null", + "name": "stage3_unit5_relu1_gamma", + "attrs": { + "__init__": "[\"Constant\", {\"value\": 0.25}]", + "act_type": "prelu" + }, + "inputs": [] + }, + { + "op": "LeakyReLU", + "name": "stage3_unit5_relu1", + "attrs": {"act_type": "prelu"}, + "inputs": [[485, 0, 0], [486, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit5_conv2_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": 
"stage3_unit5_conv2", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[487, 0, 0], [488, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit5_bn3_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit5_bn3_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit5_bn3_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit5_bn3_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit5_bn3", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[489, 0, 0], [490, 0, 0], [491, 0, 0], [492, 0, 1], [493, 0, 1]] + }, + { + "op": "elemwise_add", + "name": "_plus20", + "inputs": [[494, 0, 0], [473, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit6_bn1_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit6_bn1_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit6_bn1_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit6_bn1_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit6_bn1", + "attrs": { + "eps": 
"2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[495, 0, 0], [496, 0, 0], [497, 0, 0], [498, 0, 1], [499, 0, 1]] + }, + { + "op": "null", + "name": "stage3_unit6_conv1_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit6_conv1", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[500, 0, 0], [501, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit6_bn2_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit6_bn2_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit6_bn2_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit6_bn2_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit6_bn2", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[502, 0, 0], [503, 0, 0], [504, 0, 0], [505, 0, 1], [506, 0, 1]] + }, + { + "op": "null", + "name": "stage3_unit6_relu1_gamma", + "attrs": { + "__init__": "[\"Constant\", {\"value\": 0.25}]", + "act_type": "prelu" + }, + "inputs": [] + }, + { + "op": "LeakyReLU", + "name": "stage3_unit6_relu1", + "attrs": {"act_type": "prelu"}, + "inputs": [[507, 0, 0], [508, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit6_conv2_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + 
"stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit6_conv2", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[509, 0, 0], [510, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit6_bn3_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit6_bn3_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit6_bn3_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit6_bn3_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit6_bn3", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[511, 0, 0], [512, 0, 0], [513, 0, 0], [514, 0, 1], [515, 0, 1]] + }, + { + "op": "elemwise_add", + "name": "_plus21", + "inputs": [[516, 0, 0], [495, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit7_bn1_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit7_bn1_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit7_bn1_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit7_bn1_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, 
+ "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit7_bn1", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[517, 0, 0], [518, 0, 0], [519, 0, 0], [520, 0, 1], [521, 0, 1]] + }, + { + "op": "null", + "name": "stage3_unit7_conv1_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit7_conv1", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[522, 0, 0], [523, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit7_bn2_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit7_bn2_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit7_bn2_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit7_bn2_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit7_bn2", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[524, 0, 0], [525, 0, 0], [526, 0, 0], [527, 0, 1], [528, 0, 1]] + }, + { + "op": "null", + "name": "stage3_unit7_relu1_gamma", + "attrs": { + "__init__": "[\"Constant\", {\"value\": 0.25}]", + "act_type": "prelu" + }, + "inputs": [] + }, + { + "op": "LeakyReLU", + "name": "stage3_unit7_relu1", + "attrs": {"act_type": "prelu"}, + "inputs": [[529, 0, 0], [530, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit7_conv2_weight", + 
"attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit7_conv2", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[531, 0, 0], [532, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit7_bn3_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit7_bn3_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit7_bn3_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit7_bn3_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit7_bn3", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[533, 0, 0], [534, 0, 0], [535, 0, 0], [536, 0, 1], [537, 0, 1]] + }, + { + "op": "elemwise_add", + "name": "_plus22", + "inputs": [[538, 0, 0], [517, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit8_bn1_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit8_bn1_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit8_bn1_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit8_bn1_moving_var", + "attrs": { 
+ "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit8_bn1", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[539, 0, 0], [540, 0, 0], [541, 0, 0], [542, 0, 1], [543, 0, 1]] + }, + { + "op": "null", + "name": "stage3_unit8_conv1_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit8_conv1", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[544, 0, 0], [545, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit8_bn2_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit8_bn2_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit8_bn2_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit8_bn2_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit8_bn2", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[546, 0, 0], [547, 0, 0], [548, 0, 0], [549, 0, 1], [550, 0, 1]] + }, + { + "op": "null", + "name": "stage3_unit8_relu1_gamma", + "attrs": { + "__init__": "[\"Constant\", {\"value\": 0.25}]", + "act_type": "prelu" + }, + "inputs": [] + }, + { + "op": "LeakyReLU", + "name": "stage3_unit8_relu1", + "attrs": {"act_type": "prelu"}, + "inputs": 
[[551, 0, 0], [552, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit8_conv2_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit8_conv2", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[553, 0, 0], [554, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit8_bn3_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit8_bn3_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit8_bn3_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit8_bn3_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit8_bn3", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[555, 0, 0], [556, 0, 0], [557, 0, 0], [558, 0, 1], [559, 0, 1]] + }, + { + "op": "elemwise_add", + "name": "_plus23", + "inputs": [[560, 0, 0], [539, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit9_bn1_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit9_bn1_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit9_bn1_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, 
+ "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit9_bn1_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit9_bn1", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[561, 0, 0], [562, 0, 0], [563, 0, 0], [564, 0, 1], [565, 0, 1]] + }, + { + "op": "null", + "name": "stage3_unit9_conv1_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit9_conv1", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[566, 0, 0], [567, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit9_bn2_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit9_bn2_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit9_bn2_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit9_bn2_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit9_bn2", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[568, 0, 0], [569, 0, 0], [570, 0, 0], [571, 0, 1], [572, 0, 1]] + }, + { + "op": "null", + "name": "stage3_unit9_relu1_gamma", + "attrs": { + "__init__": "[\"Constant\", {\"value\": 0.25}]", + "act_type": "prelu" + }, + "inputs": [] + }, + { + "op": 
"LeakyReLU", + "name": "stage3_unit9_relu1", + "attrs": {"act_type": "prelu"}, + "inputs": [[573, 0, 0], [574, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit9_conv2_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit9_conv2", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[575, 0, 0], [576, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit9_bn3_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit9_bn3_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit9_bn3_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit9_bn3_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit9_bn3", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[577, 0, 0], [578, 0, 0], [579, 0, 0], [580, 0, 1], [581, 0, 1]] + }, + { + "op": "elemwise_add", + "name": "_plus24", + "inputs": [[582, 0, 0], [561, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit10_bn1_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit10_bn1_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit10_bn1_moving_mean", + "attrs": { + 
"__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit10_bn1_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit10_bn1", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[583, 0, 0], [584, 0, 0], [585, 0, 0], [586, 0, 1], [587, 0, 1]] + }, + { + "op": "null", + "name": "stage3_unit10_conv1_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit10_conv1", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[588, 0, 0], [589, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit10_bn2_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit10_bn2_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit10_bn2_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit10_bn2_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit10_bn2", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[590, 0, 0], [591, 0, 0], [592, 0, 0], [593, 0, 1], [594, 0, 1]] + }, + { + "op": "null", + "name": "stage3_unit10_relu1_gamma", + "attrs": { + 
"__init__": "[\"Constant\", {\"value\": 0.25}]", + "act_type": "prelu" + }, + "inputs": [] + }, + { + "op": "LeakyReLU", + "name": "stage3_unit10_relu1", + "attrs": {"act_type": "prelu"}, + "inputs": [[595, 0, 0], [596, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit10_conv2_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit10_conv2", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[597, 0, 0], [598, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit10_bn3_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit10_bn3_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit10_bn3_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit10_bn3_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit10_bn3", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[599, 0, 0], [600, 0, 0], [601, 0, 0], [602, 0, 1], [603, 0, 1]] + }, + { + "op": "elemwise_add", + "name": "_plus25", + "inputs": [[604, 0, 0], [583, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit11_bn1_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit11_bn1_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + 
"momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit11_bn1_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit11_bn1_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit11_bn1", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[605, 0, 0], [606, 0, 0], [607, 0, 0], [608, 0, 1], [609, 0, 1]] + }, + { + "op": "null", + "name": "stage3_unit11_conv1_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit11_conv1", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[610, 0, 0], [611, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit11_bn2_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit11_bn2_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit11_bn2_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit11_bn2_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit11_bn2", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[612, 0, 0], [613, 0, 
0], [614, 0, 0], [615, 0, 1], [616, 0, 1]] + }, + { + "op": "null", + "name": "stage3_unit11_relu1_gamma", + "attrs": { + "__init__": "[\"Constant\", {\"value\": 0.25}]", + "act_type": "prelu" + }, + "inputs": [] + }, + { + "op": "LeakyReLU", + "name": "stage3_unit11_relu1", + "attrs": {"act_type": "prelu"}, + "inputs": [[617, 0, 0], [618, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit11_conv2_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit11_conv2", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[619, 0, 0], [620, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit11_bn3_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit11_bn3_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit11_bn3_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit11_bn3_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit11_bn3", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[621, 0, 0], [622, 0, 0], [623, 0, 0], [624, 0, 1], [625, 0, 1]] + }, + { + "op": "elemwise_add", + "name": "_plus26", + "inputs": [[626, 0, 0], [605, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit12_bn1_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, 
+ { + "op": "null", + "name": "stage3_unit12_bn1_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit12_bn1_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit12_bn1_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit12_bn1", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[627, 0, 0], [628, 0, 0], [629, 0, 0], [630, 0, 1], [631, 0, 1]] + }, + { + "op": "null", + "name": "stage3_unit12_conv1_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit12_conv1", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[632, 0, 0], [633, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit12_bn2_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit12_bn2_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit12_bn2_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit12_bn2_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit12_bn2", + 
"attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[634, 0, 0], [635, 0, 0], [636, 0, 0], [637, 0, 1], [638, 0, 1]] + }, + { + "op": "null", + "name": "stage3_unit12_relu1_gamma", + "attrs": { + "__init__": "[\"Constant\", {\"value\": 0.25}]", + "act_type": "prelu" + }, + "inputs": [] + }, + { + "op": "LeakyReLU", + "name": "stage3_unit12_relu1", + "attrs": {"act_type": "prelu"}, + "inputs": [[639, 0, 0], [640, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit12_conv2_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit12_conv2", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[641, 0, 0], [642, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit12_bn3_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit12_bn3_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit12_bn3_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit12_bn3_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit12_bn3", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[643, 0, 0], [644, 0, 0], [645, 0, 0], [646, 0, 1], [647, 0, 1]] + }, + { + "op": "elemwise_add", + "name": "_plus27", + "inputs": [[648, 0, 0], [627, 0, 0]] + }, + { + "op": "null", + "name": 
"stage3_unit13_bn1_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit13_bn1_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit13_bn1_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit13_bn1_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit13_bn1", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[649, 0, 0], [650, 0, 0], [651, 0, 0], [652, 0, 1], [653, 0, 1]] + }, + { + "op": "null", + "name": "stage3_unit13_conv1_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit13_conv1", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[654, 0, 0], [655, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit13_bn2_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit13_bn2_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit13_bn2_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit13_bn2_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + 
"fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit13_bn2", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[656, 0, 0], [657, 0, 0], [658, 0, 0], [659, 0, 1], [660, 0, 1]] + }, + { + "op": "null", + "name": "stage3_unit13_relu1_gamma", + "attrs": { + "__init__": "[\"Constant\", {\"value\": 0.25}]", + "act_type": "prelu" + }, + "inputs": [] + }, + { + "op": "LeakyReLU", + "name": "stage3_unit13_relu1", + "attrs": {"act_type": "prelu"}, + "inputs": [[661, 0, 0], [662, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit13_conv2_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit13_conv2", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[663, 0, 0], [664, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit13_bn3_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit13_bn3_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit13_bn3_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit13_bn3_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit13_bn3", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[665, 0, 0], [666, 0, 0], [667, 0, 0], [668, 0, 1], [669, 0, 1]] + }, + { + "op": 
"elemwise_add", + "name": "_plus28", + "inputs": [[670, 0, 0], [649, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit14_bn1_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit14_bn1_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit14_bn1_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit14_bn1_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit14_bn1", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[671, 0, 0], [672, 0, 0], [673, 0, 0], [674, 0, 1], [675, 0, 1]] + }, + { + "op": "null", + "name": "stage3_unit14_conv1_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit14_conv1", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[676, 0, 0], [677, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit14_bn2_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit14_bn2_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit14_bn2_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": 
"null", + "name": "stage3_unit14_bn2_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit14_bn2", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[678, 0, 0], [679, 0, 0], [680, 0, 0], [681, 0, 1], [682, 0, 1]] + }, + { + "op": "null", + "name": "stage3_unit14_relu1_gamma", + "attrs": { + "__init__": "[\"Constant\", {\"value\": 0.25}]", + "act_type": "prelu" + }, + "inputs": [] + }, + { + "op": "LeakyReLU", + "name": "stage3_unit14_relu1", + "attrs": {"act_type": "prelu"}, + "inputs": [[683, 0, 0], [684, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit14_conv2_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit14_conv2", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[685, 0, 0], [686, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit14_bn3_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit14_bn3_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit14_bn3_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit14_bn3_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit14_bn3", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + 
"momentum": "0.9" + }, + "inputs": [[687, 0, 0], [688, 0, 0], [689, 0, 0], [690, 0, 1], [691, 0, 1]] + }, + { + "op": "elemwise_add", + "name": "_plus29", + "inputs": [[692, 0, 0], [671, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit15_bn1_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit15_bn1_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit15_bn1_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit15_bn1_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit15_bn1", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[693, 0, 0], [694, 0, 0], [695, 0, 0], [696, 0, 1], [697, 0, 1]] + }, + { + "op": "null", + "name": "stage3_unit15_conv1_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit15_conv1", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[698, 0, 0], [699, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit15_bn2_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit15_bn2_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit15_bn2_moving_mean", + "attrs": { + "__init__": 
"[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit15_bn2_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit15_bn2", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[700, 0, 0], [701, 0, 0], [702, 0, 0], [703, 0, 1], [704, 0, 1]] + }, + { + "op": "null", + "name": "stage3_unit15_relu1_gamma", + "attrs": { + "__init__": "[\"Constant\", {\"value\": 0.25}]", + "act_type": "prelu" + }, + "inputs": [] + }, + { + "op": "LeakyReLU", + "name": "stage3_unit15_relu1", + "attrs": {"act_type": "prelu"}, + "inputs": [[705, 0, 0], [706, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit15_conv2_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit15_conv2", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[707, 0, 0], [708, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit15_bn3_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit15_bn3_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit15_bn3_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit15_bn3_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, 
+ { + "op": "BatchNorm", + "name": "stage3_unit15_bn3", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[709, 0, 0], [710, 0, 0], [711, 0, 0], [712, 0, 1], [713, 0, 1]] + }, + { + "op": "elemwise_add", + "name": "_plus30", + "inputs": [[714, 0, 0], [693, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit16_bn1_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit16_bn1_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit16_bn1_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit16_bn1_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit16_bn1", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[715, 0, 0], [716, 0, 0], [717, 0, 0], [718, 0, 1], [719, 0, 1]] + }, + { + "op": "null", + "name": "stage3_unit16_conv1_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit16_conv1", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[720, 0, 0], [721, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit16_bn2_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit16_bn2_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + 
}, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit16_bn2_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit16_bn2_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit16_bn2", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[722, 0, 0], [723, 0, 0], [724, 0, 0], [725, 0, 1], [726, 0, 1]] + }, + { + "op": "null", + "name": "stage3_unit16_relu1_gamma", + "attrs": { + "__init__": "[\"Constant\", {\"value\": 0.25}]", + "act_type": "prelu" + }, + "inputs": [] + }, + { + "op": "LeakyReLU", + "name": "stage3_unit16_relu1", + "attrs": {"act_type": "prelu"}, + "inputs": [[727, 0, 0], [728, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit16_conv2_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit16_conv2", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[729, 0, 0], [730, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit16_bn3_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit16_bn3_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit16_bn3_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit16_bn3_moving_var", + "attrs": { + 
"__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit16_bn3", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[731, 0, 0], [732, 0, 0], [733, 0, 0], [734, 0, 1], [735, 0, 1]] + }, + { + "op": "elemwise_add", + "name": "_plus31", + "inputs": [[736, 0, 0], [715, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit17_bn1_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit17_bn1_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit17_bn1_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit17_bn1_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit17_bn1", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[737, 0, 0], [738, 0, 0], [739, 0, 0], [740, 0, 1], [741, 0, 1]] + }, + { + "op": "null", + "name": "stage3_unit17_conv1_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit17_conv1", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[742, 0, 0], [743, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit17_bn2_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": 
"null", + "name": "stage3_unit17_bn2_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit17_bn2_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit17_bn2_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit17_bn2", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[744, 0, 0], [745, 0, 0], [746, 0, 0], [747, 0, 1], [748, 0, 1]] + }, + { + "op": "null", + "name": "stage3_unit17_relu1_gamma", + "attrs": { + "__init__": "[\"Constant\", {\"value\": 0.25}]", + "act_type": "prelu" + }, + "inputs": [] + }, + { + "op": "LeakyReLU", + "name": "stage3_unit17_relu1", + "attrs": {"act_type": "prelu"}, + "inputs": [[749, 0, 0], [750, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit17_conv2_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit17_conv2", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[751, 0, 0], [752, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit17_bn3_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit17_bn3_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit17_bn3_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + 
"momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit17_bn3_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit17_bn3", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[753, 0, 0], [754, 0, 0], [755, 0, 0], [756, 0, 1], [757, 0, 1]] + }, + { + "op": "elemwise_add", + "name": "_plus32", + "inputs": [[758, 0, 0], [737, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit18_bn1_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit18_bn1_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit18_bn1_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit18_bn1_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit18_bn1", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[759, 0, 0], [760, 0, 0], [761, 0, 0], [762, 0, 1], [763, 0, 1]] + }, + { + "op": "null", + "name": "stage3_unit18_conv1_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit18_conv1", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[764, 0, 0], [765, 0, 0]] + }, + { + "op": "null", + "name": 
"stage3_unit18_bn2_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit18_bn2_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit18_bn2_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit18_bn2_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit18_bn2", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[766, 0, 0], [767, 0, 0], [768, 0, 0], [769, 0, 1], [770, 0, 1]] + }, + { + "op": "null", + "name": "stage3_unit18_relu1_gamma", + "attrs": { + "__init__": "[\"Constant\", {\"value\": 0.25}]", + "act_type": "prelu" + }, + "inputs": [] + }, + { + "op": "LeakyReLU", + "name": "stage3_unit18_relu1", + "attrs": {"act_type": "prelu"}, + "inputs": [[771, 0, 0], [772, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit18_conv2_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit18_conv2", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[773, 0, 0], [774, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit18_bn3_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit18_bn3_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": 
"null", + "name": "stage3_unit18_bn3_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit18_bn3_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit18_bn3", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[775, 0, 0], [776, 0, 0], [777, 0, 0], [778, 0, 1], [779, 0, 1]] + }, + { + "op": "elemwise_add", + "name": "_plus33", + "inputs": [[780, 0, 0], [759, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit19_bn1_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit19_bn1_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit19_bn1_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit19_bn1_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit19_bn1", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[781, 0, 0], [782, 0, 0], [783, 0, 0], [784, 0, 1], [785, 0, 1]] + }, + { + "op": "null", + "name": "stage3_unit19_conv1_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit19_conv1", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + 
"stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[786, 0, 0], [787, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit19_bn2_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit19_bn2_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit19_bn2_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit19_bn2_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit19_bn2", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[788, 0, 0], [789, 0, 0], [790, 0, 0], [791, 0, 1], [792, 0, 1]] + }, + { + "op": "null", + "name": "stage3_unit19_relu1_gamma", + "attrs": { + "__init__": "[\"Constant\", {\"value\": 0.25}]", + "act_type": "prelu" + }, + "inputs": [] + }, + { + "op": "LeakyReLU", + "name": "stage3_unit19_relu1", + "attrs": {"act_type": "prelu"}, + "inputs": [[793, 0, 0], [794, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit19_conv2_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit19_conv2", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[795, 0, 0], [796, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit19_bn3_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": 
"stage3_unit19_bn3_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit19_bn3_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit19_bn3_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit19_bn3", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[797, 0, 0], [798, 0, 0], [799, 0, 0], [800, 0, 1], [801, 0, 1]] + }, + { + "op": "elemwise_add", + "name": "_plus34", + "inputs": [[802, 0, 0], [781, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit20_bn1_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit20_bn1_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit20_bn1_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit20_bn1_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit20_bn1", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[803, 0, 0], [804, 0, 0], [805, 0, 0], [806, 0, 1], [807, 0, 1]] + }, + { + "op": "null", + "name": "stage3_unit20_conv1_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", 
+ "name": "stage3_unit20_conv1", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[808, 0, 0], [809, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit20_bn2_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit20_bn2_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit20_bn2_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit20_bn2_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit20_bn2", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[810, 0, 0], [811, 0, 0], [812, 0, 0], [813, 0, 1], [814, 0, 1]] + }, + { + "op": "null", + "name": "stage3_unit20_relu1_gamma", + "attrs": { + "__init__": "[\"Constant\", {\"value\": 0.25}]", + "act_type": "prelu" + }, + "inputs": [] + }, + { + "op": "LeakyReLU", + "name": "stage3_unit20_relu1", + "attrs": {"act_type": "prelu"}, + "inputs": [[815, 0, 0], [816, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit20_conv2_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit20_conv2", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[817, 0, 0], [818, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit20_bn3_gamma", + "attrs": { + 
"eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit20_bn3_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit20_bn3_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit20_bn3_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit20_bn3", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[819, 0, 0], [820, 0, 0], [821, 0, 0], [822, 0, 1], [823, 0, 1]] + }, + { + "op": "elemwise_add", + "name": "_plus35", + "inputs": [[824, 0, 0], [803, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit21_bn1_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit21_bn1_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit21_bn1_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit21_bn1_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit21_bn1", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[825, 0, 0], [826, 0, 0], [827, 0, 0], [828, 0, 1], [829, 0, 1]] + }, + { + "op": "null", + "name": "stage3_unit21_conv1_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", 
+ "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit21_conv1", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[830, 0, 0], [831, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit21_bn2_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit21_bn2_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit21_bn2_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit21_bn2_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit21_bn2", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[832, 0, 0], [833, 0, 0], [834, 0, 0], [835, 0, 1], [836, 0, 1]] + }, + { + "op": "null", + "name": "stage3_unit21_relu1_gamma", + "attrs": { + "__init__": "[\"Constant\", {\"value\": 0.25}]", + "act_type": "prelu" + }, + "inputs": [] + }, + { + "op": "LeakyReLU", + "name": "stage3_unit21_relu1", + "attrs": {"act_type": "prelu"}, + "inputs": [[837, 0, 0], [838, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit21_conv2_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit21_conv2", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + 
"inputs": [[839, 0, 0], [840, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit21_bn3_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit21_bn3_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit21_bn3_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit21_bn3_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit21_bn3", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[841, 0, 0], [842, 0, 0], [843, 0, 0], [844, 0, 1], [845, 0, 1]] + }, + { + "op": "elemwise_add", + "name": "_plus36", + "inputs": [[846, 0, 0], [825, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit22_bn1_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit22_bn1_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit22_bn1_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit22_bn1_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit22_bn1", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[847, 0, 0], [848, 0, 0], [849, 0, 0], [850, 0, 1], [851, 0, 1]] + }, + { + "op": "null", + 
"name": "stage3_unit22_conv1_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit22_conv1", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[852, 0, 0], [853, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit22_bn2_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit22_bn2_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit22_bn2_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit22_bn2_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit22_bn2", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[854, 0, 0], [855, 0, 0], [856, 0, 0], [857, 0, 1], [858, 0, 1]] + }, + { + "op": "null", + "name": "stage3_unit22_relu1_gamma", + "attrs": { + "__init__": "[\"Constant\", {\"value\": 0.25}]", + "act_type": "prelu" + }, + "inputs": [] + }, + { + "op": "LeakyReLU", + "name": "stage3_unit22_relu1", + "attrs": {"act_type": "prelu"}, + "inputs": [[859, 0, 0], [860, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit22_conv2_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit22_conv2", + "attrs": { + "kernel": "(3, 
3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[861, 0, 0], [862, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit22_bn3_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit22_bn3_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit22_bn3_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit22_bn3_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit22_bn3", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[863, 0, 0], [864, 0, 0], [865, 0, 0], [866, 0, 1], [867, 0, 1]] + }, + { + "op": "elemwise_add", + "name": "_plus37", + "inputs": [[868, 0, 0], [847, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit23_bn1_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit23_bn1_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit23_bn1_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit23_bn1_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit23_bn1", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": 
"0.9" + }, + "inputs": [[869, 0, 0], [870, 0, 0], [871, 0, 0], [872, 0, 1], [873, 0, 1]] + }, + { + "op": "null", + "name": "stage3_unit23_conv1_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit23_conv1", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[874, 0, 0], [875, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit23_bn2_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit23_bn2_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit23_bn2_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit23_bn2_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit23_bn2", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[876, 0, 0], [877, 0, 0], [878, 0, 0], [879, 0, 1], [880, 0, 1]] + }, + { + "op": "null", + "name": "stage3_unit23_relu1_gamma", + "attrs": { + "__init__": "[\"Constant\", {\"value\": 0.25}]", + "act_type": "prelu" + }, + "inputs": [] + }, + { + "op": "LeakyReLU", + "name": "stage3_unit23_relu1", + "attrs": {"act_type": "prelu"}, + "inputs": [[881, 0, 0], [882, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit23_conv2_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + 
}, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit23_conv2", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[883, 0, 0], [884, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit23_bn3_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit23_bn3_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit23_bn3_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit23_bn3_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit23_bn3", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[885, 0, 0], [886, 0, 0], [887, 0, 0], [888, 0, 1], [889, 0, 1]] + }, + { + "op": "elemwise_add", + "name": "_plus38", + "inputs": [[890, 0, 0], [869, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit24_bn1_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit24_bn1_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit24_bn1_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit24_bn1_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": 
"BatchNorm", + "name": "stage3_unit24_bn1", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[891, 0, 0], [892, 0, 0], [893, 0, 0], [894, 0, 1], [895, 0, 1]] + }, + { + "op": "null", + "name": "stage3_unit24_conv1_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit24_conv1", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[896, 0, 0], [897, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit24_bn2_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit24_bn2_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit24_bn2_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit24_bn2_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit24_bn2", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[898, 0, 0], [899, 0, 0], [900, 0, 0], [901, 0, 1], [902, 0, 1]] + }, + { + "op": "null", + "name": "stage3_unit24_relu1_gamma", + "attrs": { + "__init__": "[\"Constant\", {\"value\": 0.25}]", + "act_type": "prelu" + }, + "inputs": [] + }, + { + "op": "LeakyReLU", + "name": "stage3_unit24_relu1", + "attrs": {"act_type": "prelu"}, + "inputs": [[903, 0, 0], [904, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit24_conv2_weight", + "attrs": { + "kernel": "(3, 
3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit24_conv2", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[905, 0, 0], [906, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit24_bn3_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit24_bn3_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit24_bn3_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit24_bn3_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit24_bn3", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[907, 0, 0], [908, 0, 0], [909, 0, 0], [910, 0, 1], [911, 0, 1]] + }, + { + "op": "elemwise_add", + "name": "_plus39", + "inputs": [[912, 0, 0], [891, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit25_bn1_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit25_bn1_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit25_bn1_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit25_bn1_moving_var", + "attrs": { + "__init__": 
"[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit25_bn1", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[913, 0, 0], [914, 0, 0], [915, 0, 0], [916, 0, 1], [917, 0, 1]] + }, + { + "op": "null", + "name": "stage3_unit25_conv1_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit25_conv1", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[918, 0, 0], [919, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit25_bn2_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit25_bn2_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit25_bn2_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit25_bn2_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit25_bn2", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[920, 0, 0], [921, 0, 0], [922, 0, 0], [923, 0, 1], [924, 0, 1]] + }, + { + "op": "null", + "name": "stage3_unit25_relu1_gamma", + "attrs": { + "__init__": "[\"Constant\", {\"value\": 0.25}]", + "act_type": "prelu" + }, + "inputs": [] + }, + { + "op": "LeakyReLU", + "name": "stage3_unit25_relu1", + "attrs": {"act_type": "prelu"}, + "inputs": [[925, 
0, 0], [926, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit25_conv2_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit25_conv2", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[927, 0, 0], [928, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit25_bn3_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit25_bn3_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit25_bn3_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit25_bn3_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit25_bn3", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[929, 0, 0], [930, 0, 0], [931, 0, 0], [932, 0, 1], [933, 0, 1]] + }, + { + "op": "elemwise_add", + "name": "_plus40", + "inputs": [[934, 0, 0], [913, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit26_bn1_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit26_bn1_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit26_bn1_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + 
}, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit26_bn1_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit26_bn1", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[935, 0, 0], [936, 0, 0], [937, 0, 0], [938, 0, 1], [939, 0, 1]] + }, + { + "op": "null", + "name": "stage3_unit26_conv1_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit26_conv1", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[940, 0, 0], [941, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit26_bn2_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit26_bn2_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit26_bn2_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit26_bn2_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit26_bn2", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[942, 0, 0], [943, 0, 0], [944, 0, 0], [945, 0, 1], [946, 0, 1]] + }, + { + "op": "null", + "name": "stage3_unit26_relu1_gamma", + "attrs": { + "__init__": "[\"Constant\", {\"value\": 0.25}]", + "act_type": "prelu" + }, + "inputs": [] + 
}, + { + "op": "LeakyReLU", + "name": "stage3_unit26_relu1", + "attrs": {"act_type": "prelu"}, + "inputs": [[947, 0, 0], [948, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit26_conv2_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit26_conv2", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[949, 0, 0], [950, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit26_bn3_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit26_bn3_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit26_bn3_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit26_bn3_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit26_bn3", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[951, 0, 0], [952, 0, 0], [953, 0, 0], [954, 0, 1], [955, 0, 1]] + }, + { + "op": "elemwise_add", + "name": "_plus41", + "inputs": [[956, 0, 0], [935, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit27_bn1_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit27_bn1_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": 
"stage3_unit27_bn1_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit27_bn1_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit27_bn1", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[957, 0, 0], [958, 0, 0], [959, 0, 0], [960, 0, 1], [961, 0, 1]] + }, + { + "op": "null", + "name": "stage3_unit27_conv1_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit27_conv1", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[962, 0, 0], [963, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit27_bn2_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit27_bn2_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit27_bn2_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit27_bn2_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit27_bn2", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[964, 0, 0], [965, 0, 0], [966, 0, 0], [967, 0, 1], [968, 0, 1]] + }, + { + "op": "null", + 
"name": "stage3_unit27_relu1_gamma", + "attrs": { + "__init__": "[\"Constant\", {\"value\": 0.25}]", + "act_type": "prelu" + }, + "inputs": [] + }, + { + "op": "LeakyReLU", + "name": "stage3_unit27_relu1", + "attrs": {"act_type": "prelu"}, + "inputs": [[969, 0, 0], [970, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit27_conv2_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit27_conv2", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[971, 0, 0], [972, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit27_bn3_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit27_bn3_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit27_bn3_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit27_bn3_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit27_bn3", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[973, 0, 0], [974, 0, 0], [975, 0, 0], [976, 0, 1], [977, 0, 1]] + }, + { + "op": "elemwise_add", + "name": "_plus42", + "inputs": [[978, 0, 0], [957, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit28_bn1_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit28_bn1_beta", + "attrs": { 
+ "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit28_bn1_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit28_bn1_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit28_bn1", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[979, 0, 0], [980, 0, 0], [981, 0, 0], [982, 0, 1], [983, 0, 1]] + }, + { + "op": "null", + "name": "stage3_unit28_conv1_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit28_conv1", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[984, 0, 0], [985, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit28_bn2_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit28_bn2_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit28_bn2_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit28_bn2_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit28_bn2", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": 
"0.9" + }, + "inputs": [[986, 0, 0], [987, 0, 0], [988, 0, 0], [989, 0, 1], [990, 0, 1]] + }, + { + "op": "null", + "name": "stage3_unit28_relu1_gamma", + "attrs": { + "__init__": "[\"Constant\", {\"value\": 0.25}]", + "act_type": "prelu" + }, + "inputs": [] + }, + { + "op": "LeakyReLU", + "name": "stage3_unit28_relu1", + "attrs": {"act_type": "prelu"}, + "inputs": [[991, 0, 0], [992, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit28_conv2_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit28_conv2", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[993, 0, 0], [994, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit28_bn3_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit28_bn3_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit28_bn3_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit28_bn3_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit28_bn3", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[995, 0, 0], [996, 0, 0], [997, 0, 0], [998, 0, 1], [999, 0, 1]] + }, + { + "op": "elemwise_add", + "name": "_plus43", + "inputs": [[1000, 0, 0], [979, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit29_bn1_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": 
"False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit29_bn1_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit29_bn1_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit29_bn1_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit29_bn1", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[1001, 0, 0], [1002, 0, 0], [1003, 0, 0], [1004, 0, 1], [1005, 0, 1]] + }, + { + "op": "null", + "name": "stage3_unit29_conv1_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit29_conv1", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[1006, 0, 0], [1007, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit29_bn2_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit29_bn2_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit29_bn2_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit29_bn2_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + 
}, + { + "op": "BatchNorm", + "name": "stage3_unit29_bn2", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[1008, 0, 0], [1009, 0, 0], [1010, 0, 0], [1011, 0, 1], [1012, 0, 1]] + }, + { + "op": "null", + "name": "stage3_unit29_relu1_gamma", + "attrs": { + "__init__": "[\"Constant\", {\"value\": 0.25}]", + "act_type": "prelu" + }, + "inputs": [] + }, + { + "op": "LeakyReLU", + "name": "stage3_unit29_relu1", + "attrs": {"act_type": "prelu"}, + "inputs": [[1013, 0, 0], [1014, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit29_conv2_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit29_conv2", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[1015, 0, 0], [1016, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit29_bn3_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit29_bn3_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit29_bn3_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit29_bn3_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit29_bn3", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[1017, 0, 0], [1018, 0, 0], [1019, 0, 0], [1020, 0, 1], [1021, 0, 1]] + }, + { + "op": "elemwise_add", + "name": "_plus44", + "inputs": 
[[1022, 0, 0], [1001, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit30_bn1_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit30_bn1_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit30_bn1_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit30_bn1_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit30_bn1", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[1023, 0, 0], [1024, 0, 0], [1025, 0, 0], [1026, 0, 1], [1027, 0, 1]] + }, + { + "op": "null", + "name": "stage3_unit30_conv1_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit30_conv1", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[1028, 0, 0], [1029, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit30_bn2_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit30_bn2_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit30_bn2_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": 
"stage3_unit30_bn2_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit30_bn2", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[1030, 0, 0], [1031, 0, 0], [1032, 0, 0], [1033, 0, 1], [1034, 0, 1]] + }, + { + "op": "null", + "name": "stage3_unit30_relu1_gamma", + "attrs": { + "__init__": "[\"Constant\", {\"value\": 0.25}]", + "act_type": "prelu" + }, + "inputs": [] + }, + { + "op": "LeakyReLU", + "name": "stage3_unit30_relu1", + "attrs": {"act_type": "prelu"}, + "inputs": [[1035, 0, 0], [1036, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit30_conv2_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit30_conv2", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[1037, 0, 0], [1038, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit30_bn3_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit30_bn3_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit30_bn3_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit30_bn3_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit30_bn3", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": 
"0.9" + }, + "inputs": [[1039, 0, 0], [1040, 0, 0], [1041, 0, 0], [1042, 0, 1], [1043, 0, 1]] + }, + { + "op": "elemwise_add", + "name": "_plus45", + "inputs": [[1044, 0, 0], [1023, 0, 0]] + }, + { + "op": "null", + "name": "stage4_unit1_bn1_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage4_unit1_bn1_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage4_unit1_bn1_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage4_unit1_bn1_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage4_unit1_bn1", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[1045, 0, 0], [1046, 0, 0], [1047, 0, 0], [1048, 0, 1], [1049, 0, 1]] + }, + { + "op": "null", + "name": "stage4_unit1_conv1_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "512", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage4_unit1_conv1", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "512", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[1050, 0, 0], [1051, 0, 0]] + }, + { + "op": "null", + "name": "stage4_unit1_bn2_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage4_unit1_bn2_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage4_unit1_bn2_moving_mean", + "attrs": { + "__init__": 
"[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage4_unit1_bn2_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage4_unit1_bn2", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[1052, 0, 0], [1053, 0, 0], [1054, 0, 0], [1055, 0, 1], [1056, 0, 1]] + }, + { + "op": "null", + "name": "stage4_unit1_relu1_gamma", + "attrs": { + "__init__": "[\"Constant\", {\"value\": 0.25}]", + "act_type": "prelu" + }, + "inputs": [] + }, + { + "op": "LeakyReLU", + "name": "stage4_unit1_relu1", + "attrs": {"act_type": "prelu"}, + "inputs": [[1057, 0, 0], [1058, 0, 0]] + }, + { + "op": "null", + "name": "stage4_unit1_conv2_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "512", + "pad": "(1, 1)", + "stride": "(2, 2)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage4_unit1_conv2", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "512", + "pad": "(1, 1)", + "stride": "(2, 2)", + "workspace": "256" + }, + "inputs": [[1059, 0, 0], [1060, 0, 0]] + }, + { + "op": "null", + "name": "stage4_unit1_bn3_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage4_unit1_bn3_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage4_unit1_bn3_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage4_unit1_bn3_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, 
+ { + "op": "BatchNorm", + "name": "stage4_unit1_bn3", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[1061, 0, 0], [1062, 0, 0], [1063, 0, 0], [1064, 0, 1], [1065, 0, 1]] + }, + { + "op": "null", + "name": "stage4_unit1_conv1sc_weight", + "attrs": { + "kernel": "(1, 1)", + "no_bias": "True", + "num_filter": "512", + "stride": "(2, 2)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage4_unit1_conv1sc", + "attrs": { + "kernel": "(1, 1)", + "no_bias": "True", + "num_filter": "512", + "stride": "(2, 2)", + "workspace": "256" + }, + "inputs": [[1045, 0, 0], [1067, 0, 0]] + }, + { + "op": "null", + "name": "stage4_unit1_sc_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage4_unit1_sc_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage4_unit1_sc_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage4_unit1_sc_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage4_unit1_sc", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[1068, 0, 0], [1069, 0, 0], [1070, 0, 0], [1071, 0, 1], [1072, 0, 1]] + }, + { + "op": "elemwise_add", + "name": "_plus46", + "inputs": [[1066, 0, 0], [1073, 0, 0]] + }, + { + "op": "null", + "name": "stage4_unit2_bn1_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage4_unit2_bn1_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": 
"null", + "name": "stage4_unit2_bn1_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage4_unit2_bn1_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage4_unit2_bn1", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[1074, 0, 0], [1075, 0, 0], [1076, 0, 0], [1077, 0, 1], [1078, 0, 1]] + }, + { + "op": "null", + "name": "stage4_unit2_conv1_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "512", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage4_unit2_conv1", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "512", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[1079, 0, 0], [1080, 0, 0]] + }, + { + "op": "null", + "name": "stage4_unit2_bn2_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage4_unit2_bn2_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage4_unit2_bn2_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage4_unit2_bn2_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage4_unit2_bn2", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[1081, 0, 0], [1082, 0, 0], [1083, 0, 0], [1084, 0, 1], [1085, 0, 1]] + }, + { + 
"op": "null", + "name": "stage4_unit2_relu1_gamma", + "attrs": { + "__init__": "[\"Constant\", {\"value\": 0.25}]", + "act_type": "prelu" + }, + "inputs": [] + }, + { + "op": "LeakyReLU", + "name": "stage4_unit2_relu1", + "attrs": {"act_type": "prelu"}, + "inputs": [[1086, 0, 0], [1087, 0, 0]] + }, + { + "op": "null", + "name": "stage4_unit2_conv2_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "512", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage4_unit2_conv2", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "512", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[1088, 0, 0], [1089, 0, 0]] + }, + { + "op": "null", + "name": "stage4_unit2_bn3_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage4_unit2_bn3_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage4_unit2_bn3_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage4_unit2_bn3_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage4_unit2_bn3", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[1090, 0, 0], [1091, 0, 0], [1092, 0, 0], [1093, 0, 1], [1094, 0, 1]] + }, + { + "op": "elemwise_add", + "name": "_plus47", + "inputs": [[1095, 0, 0], [1074, 0, 0]] + }, + { + "op": "null", + "name": "stage4_unit3_bn1_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": 
"stage4_unit3_bn1_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage4_unit3_bn1_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage4_unit3_bn1_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage4_unit3_bn1", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[1096, 0, 0], [1097, 0, 0], [1098, 0, 0], [1099, 0, 1], [1100, 0, 1]] + }, + { + "op": "null", + "name": "stage4_unit3_conv1_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "512", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage4_unit3_conv1", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "512", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[1101, 0, 0], [1102, 0, 0]] + }, + { + "op": "null", + "name": "stage4_unit3_bn2_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage4_unit3_bn2_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage4_unit3_bn2_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage4_unit3_bn2_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage4_unit3_bn2", + "attrs": { + "eps": "2e-05", + 
"fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[1103, 0, 0], [1104, 0, 0], [1105, 0, 0], [1106, 0, 1], [1107, 0, 1]] + }, + { + "op": "null", + "name": "stage4_unit3_relu1_gamma", + "attrs": { + "__init__": "[\"Constant\", {\"value\": 0.25}]", + "act_type": "prelu" + }, + "inputs": [] + }, + { + "op": "LeakyReLU", + "name": "stage4_unit3_relu1", + "attrs": {"act_type": "prelu"}, + "inputs": [[1108, 0, 0], [1109, 0, 0]] + }, + { + "op": "null", + "name": "stage4_unit3_conv2_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "512", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage4_unit3_conv2", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "512", + "pad": "(1, 1)", + "stride": "(1, 1)", + "workspace": "256" + }, + "inputs": [[1110, 0, 0], [1111, 0, 0]] + }, + { + "op": "null", + "name": "stage4_unit3_bn3_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage4_unit3_bn3_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage4_unit3_bn3_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage4_unit3_bn3_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage4_unit3_bn3", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[1112, 0, 0], [1113, 0, 0], [1114, 0, 0], [1115, 0, 1], [1116, 0, 1]] + }, + { + "op": "elemwise_add", + "name": "_plus48", + "inputs": [[1117, 0, 0], [1096, 0, 0]] + }, + { + "op": "null", + "name": "bn1_gamma", + "attrs": { + "eps": 
"2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "bn1_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "bn1_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "bn1_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "bn1", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[1118, 0, 0], [1119, 0, 0], [1120, 0, 0], [1121, 0, 1], [1122, 0, 1]] + }, + { + "op": "Dropout", + "name": "dropout0", + "attrs": {"p": "0.4"}, + "inputs": [[1123, 0, 0]] + }, + { + "op": "null", + "name": "pre_fc1_weight", + "attrs": {"num_hidden": "512"}, + "inputs": [] + }, + { + "op": "null", + "name": "pre_fc1_bias", + "attrs": {"num_hidden": "512"}, + "inputs": [] + }, + { + "op": "FullyConnected", + "name": "pre_fc1", + "attrs": {"num_hidden": "512"}, + "inputs": [[1124, 0, 0], [1125, 0, 0], [1126, 0, 0]] + }, + { + "op": "null", + "name": "fc1_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "True", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "fc1_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "True", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "fc1_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "True", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "fc1_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "True", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "fc1", + "attrs": { + "eps": "2e-05", + "fix_gamma": "True", + "momentum": "0.9" + 
}, + "inputs": [[1127, 0, 0], [1128, 0, 0], [1129, 0, 0], [1130, 0, 1], [1131, 0, 1]] + } + ], + "arg_nodes": [ + 0, + 4, + 6, + 7, + 8, + 9, + 11, + 13, + 14, + 15, + 16, + 18, + 20, + 21, + 22, + 23, + 25, + 27, + 29, + 30, + 31, + 32, + 34, + 36, + 37, + 38, + 39, + 42, + 43, + 44, + 45, + 47, + 49, + 50, + 51, + 52, + 54, + 56, + 58, + 59, + 60, + 61, + 64, + 65, + 66, + 67, + 69, + 71, + 72, + 73, + 74, + 76, + 78, + 80, + 81, + 82, + 83, + 86, + 87, + 88, + 89, + 91, + 93, + 94, + 95, + 96, + 98, + 100, + 102, + 103, + 104, + 105, + 107, + 109, + 110, + 111, + 112, + 115, + 116, + 117, + 118, + 120, + 122, + 123, + 124, + 125, + 127, + 129, + 131, + 132, + 133, + 134, + 137, + 138, + 139, + 140, + 142, + 144, + 145, + 146, + 147, + 149, + 151, + 153, + 154, + 155, + 156, + 159, + 160, + 161, + 162, + 164, + 166, + 167, + 168, + 169, + 171, + 173, + 175, + 176, + 177, + 178, + 181, + 182, + 183, + 184, + 186, + 188, + 189, + 190, + 191, + 193, + 195, + 197, + 198, + 199, + 200, + 203, + 204, + 205, + 206, + 208, + 210, + 211, + 212, + 213, + 215, + 217, + 219, + 220, + 221, + 222, + 225, + 226, + 227, + 228, + 230, + 232, + 233, + 234, + 235, + 237, + 239, + 241, + 242, + 243, + 244, + 247, + 248, + 249, + 250, + 252, + 254, + 255, + 256, + 257, + 259, + 261, + 263, + 264, + 265, + 266, + 269, + 270, + 271, + 272, + 274, + 276, + 277, + 278, + 279, + 281, + 283, + 285, + 286, + 287, + 288, + 291, + 292, + 293, + 294, + 296, + 298, + 299, + 300, + 301, + 303, + 305, + 307, + 308, + 309, + 310, + 313, + 314, + 315, + 316, + 318, + 320, + 321, + 322, + 323, + 325, + 327, + 329, + 330, + 331, + 332, + 335, + 336, + 337, + 338, + 340, + 342, + 343, + 344, + 345, + 347, + 349, + 351, + 352, + 353, + 354, + 357, + 358, + 359, + 360, + 362, + 364, + 365, + 366, + 367, + 369, + 371, + 373, + 374, + 375, + 376, + 379, + 380, + 381, + 382, + 384, + 386, + 387, + 388, + 389, + 391, + 393, + 395, + 396, + 397, + 398, + 400, + 402, + 403, + 404, + 405, + 408, + 409, + 410, 
+ 411, + 413, + 415, + 416, + 417, + 418, + 420, + 422, + 424, + 425, + 426, + 427, + 430, + 431, + 432, + 433, + 435, + 437, + 438, + 439, + 440, + 442, + 444, + 446, + 447, + 448, + 449, + 452, + 453, + 454, + 455, + 457, + 459, + 460, + 461, + 462, + 464, + 466, + 468, + 469, + 470, + 471, + 474, + 475, + 476, + 477, + 479, + 481, + 482, + 483, + 484, + 486, + 488, + 490, + 491, + 492, + 493, + 496, + 497, + 498, + 499, + 501, + 503, + 504, + 505, + 506, + 508, + 510, + 512, + 513, + 514, + 515, + 518, + 519, + 520, + 521, + 523, + 525, + 526, + 527, + 528, + 530, + 532, + 534, + 535, + 536, + 537, + 540, + 541, + 542, + 543, + 545, + 547, + 548, + 549, + 550, + 552, + 554, + 556, + 557, + 558, + 559, + 562, + 563, + 564, + 565, + 567, + 569, + 570, + 571, + 572, + 574, + 576, + 578, + 579, + 580, + 581, + 584, + 585, + 586, + 587, + 589, + 591, + 592, + 593, + 594, + 596, + 598, + 600, + 601, + 602, + 603, + 606, + 607, + 608, + 609, + 611, + 613, + 614, + 615, + 616, + 618, + 620, + 622, + 623, + 624, + 625, + 628, + 629, + 630, + 631, + 633, + 635, + 636, + 637, + 638, + 640, + 642, + 644, + 645, + 646, + 647, + 650, + 651, + 652, + 653, + 655, + 657, + 658, + 659, + 660, + 662, + 664, + 666, + 667, + 668, + 669, + 672, + 673, + 674, + 675, + 677, + 679, + 680, + 681, + 682, + 684, + 686, + 688, + 689, + 690, + 691, + 694, + 695, + 696, + 697, + 699, + 701, + 702, + 703, + 704, + 706, + 708, + 710, + 711, + 712, + 713, + 716, + 717, + 718, + 719, + 721, + 723, + 724, + 725, + 726, + 728, + 730, + 732, + 733, + 734, + 735, + 738, + 739, + 740, + 741, + 743, + 745, + 746, + 747, + 748, + 750, + 752, + 754, + 755, + 756, + 757, + 760, + 761, + 762, + 763, + 765, + 767, + 768, + 769, + 770, + 772, + 774, + 776, + 777, + 778, + 779, + 782, + 783, + 784, + 785, + 787, + 789, + 790, + 791, + 792, + 794, + 796, + 798, + 799, + 800, + 801, + 804, + 805, + 806, + 807, + 809, + 811, + 812, + 813, + 814, + 816, + 818, + 820, + 821, + 822, + 823, + 826, + 827, + 828, + 
829, + 831, + 833, + 834, + 835, + 836, + 838, + 840, + 842, + 843, + 844, + 845, + 848, + 849, + 850, + 851, + 853, + 855, + 856, + 857, + 858, + 860, + 862, + 864, + 865, + 866, + 867, + 870, + 871, + 872, + 873, + 875, + 877, + 878, + 879, + 880, + 882, + 884, + 886, + 887, + 888, + 889, + 892, + 893, + 894, + 895, + 897, + 899, + 900, + 901, + 902, + 904, + 906, + 908, + 909, + 910, + 911, + 914, + 915, + 916, + 917, + 919, + 921, + 922, + 923, + 924, + 926, + 928, + 930, + 931, + 932, + 933, + 936, + 937, + 938, + 939, + 941, + 943, + 944, + 945, + 946, + 948, + 950, + 952, + 953, + 954, + 955, + 958, + 959, + 960, + 961, + 963, + 965, + 966, + 967, + 968, + 970, + 972, + 974, + 975, + 976, + 977, + 980, + 981, + 982, + 983, + 985, + 987, + 988, + 989, + 990, + 992, + 994, + 996, + 997, + 998, + 999, + 1002, + 1003, + 1004, + 1005, + 1007, + 1009, + 1010, + 1011, + 1012, + 1014, + 1016, + 1018, + 1019, + 1020, + 1021, + 1024, + 1025, + 1026, + 1027, + 1029, + 1031, + 1032, + 1033, + 1034, + 1036, + 1038, + 1040, + 1041, + 1042, + 1043, + 1046, + 1047, + 1048, + 1049, + 1051, + 1053, + 1054, + 1055, + 1056, + 1058, + 1060, + 1062, + 1063, + 1064, + 1065, + 1067, + 1069, + 1070, + 1071, + 1072, + 1075, + 1076, + 1077, + 1078, + 1080, + 1082, + 1083, + 1084, + 1085, + 1087, + 1089, + 1091, + 1092, + 1093, + 1094, + 1097, + 1098, + 1099, + 1100, + 1102, + 1104, + 1105, + 1106, + 1107, + 1109, + 1111, + 1113, + 1114, + 1115, + 1116, + 1119, + 1120, + 1121, + 1122, + 1125, + 1126, + 1128, + 1129, + 1130, + 1131 + ], + "node_row_ptr": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 51, + 52, + 53, + 54, + 55, + 56, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 79, + 80, + 81, + 82, + 83, + 84, + 87, + 88, + 89, + 90, + 91, + 
92, + 93, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 107, + 108, + 109, + 110, + 111, + 112, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 124, + 125, + 126, + 127, + 128, + 129, + 130, + 131, + 132, + 135, + 136, + 137, + 138, + 139, + 140, + 141, + 144, + 145, + 146, + 147, + 148, + 149, + 152, + 153, + 154, + 155, + 156, + 157, + 158, + 161, + 162, + 163, + 164, + 165, + 166, + 167, + 168, + 169, + 172, + 173, + 174, + 175, + 176, + 177, + 180, + 181, + 182, + 183, + 184, + 185, + 186, + 189, + 190, + 191, + 192, + 193, + 194, + 195, + 196, + 197, + 200, + 201, + 202, + 203, + 204, + 205, + 208, + 209, + 210, + 211, + 212, + 213, + 214, + 217, + 218, + 219, + 220, + 221, + 222, + 223, + 224, + 225, + 228, + 229, + 230, + 231, + 232, + 233, + 236, + 237, + 238, + 239, + 240, + 241, + 242, + 245, + 246, + 247, + 248, + 249, + 250, + 251, + 252, + 253, + 256, + 257, + 258, + 259, + 260, + 261, + 264, + 265, + 266, + 267, + 268, + 269, + 270, + 273, + 274, + 275, + 276, + 277, + 278, + 279, + 280, + 281, + 284, + 285, + 286, + 287, + 288, + 289, + 292, + 293, + 294, + 295, + 296, + 297, + 298, + 301, + 302, + 303, + 304, + 305, + 306, + 307, + 308, + 309, + 312, + 313, + 314, + 315, + 316, + 317, + 320, + 321, + 322, + 323, + 324, + 325, + 326, + 329, + 330, + 331, + 332, + 333, + 334, + 335, + 336, + 337, + 340, + 341, + 342, + 343, + 344, + 345, + 348, + 349, + 350, + 351, + 352, + 353, + 354, + 357, + 358, + 359, + 360, + 361, + 362, + 363, + 364, + 365, + 368, + 369, + 370, + 371, + 372, + 373, + 376, + 377, + 378, + 379, + 380, + 381, + 382, + 385, + 386, + 387, + 388, + 389, + 390, + 391, + 392, + 393, + 396, + 397, + 398, + 399, + 400, + 401, + 404, + 405, + 406, + 407, + 408, + 409, + 410, + 413, + 414, + 415, + 416, + 417, + 418, + 419, + 420, + 421, + 424, + 425, + 426, + 427, + 428, + 429, + 432, + 433, + 434, + 435, + 436, + 437, + 438, + 441, + 442, + 443, + 444, + 445, + 446, + 447, + 448, + 449, + 452, + 453, + 454, + 455, + 
456, + 457, + 460, + 461, + 462, + 463, + 464, + 465, + 466, + 469, + 470, + 471, + 472, + 473, + 474, + 475, + 476, + 477, + 480, + 481, + 482, + 483, + 484, + 485, + 488, + 489, + 490, + 491, + 492, + 493, + 494, + 497, + 498, + 499, + 500, + 501, + 502, + 503, + 504, + 505, + 508, + 509, + 510, + 511, + 512, + 513, + 514, + 517, + 518, + 519, + 520, + 521, + 522, + 525, + 526, + 527, + 528, + 529, + 530, + 531, + 534, + 535, + 536, + 537, + 538, + 539, + 540, + 541, + 542, + 545, + 546, + 547, + 548, + 549, + 550, + 553, + 554, + 555, + 556, + 557, + 558, + 559, + 562, + 563, + 564, + 565, + 566, + 567, + 568, + 569, + 570, + 573, + 574, + 575, + 576, + 577, + 578, + 581, + 582, + 583, + 584, + 585, + 586, + 587, + 590, + 591, + 592, + 593, + 594, + 595, + 596, + 597, + 598, + 601, + 602, + 603, + 604, + 605, + 606, + 609, + 610, + 611, + 612, + 613, + 614, + 615, + 618, + 619, + 620, + 621, + 622, + 623, + 624, + 625, + 626, + 629, + 630, + 631, + 632, + 633, + 634, + 637, + 638, + 639, + 640, + 641, + 642, + 643, + 646, + 647, + 648, + 649, + 650, + 651, + 652, + 653, + 654, + 657, + 658, + 659, + 660, + 661, + 662, + 665, + 666, + 667, + 668, + 669, + 670, + 671, + 674, + 675, + 676, + 677, + 678, + 679, + 680, + 681, + 682, + 685, + 686, + 687, + 688, + 689, + 690, + 693, + 694, + 695, + 696, + 697, + 698, + 699, + 702, + 703, + 704, + 705, + 706, + 707, + 708, + 709, + 710, + 713, + 714, + 715, + 716, + 717, + 718, + 721, + 722, + 723, + 724, + 725, + 726, + 727, + 730, + 731, + 732, + 733, + 734, + 735, + 736, + 737, + 738, + 741, + 742, + 743, + 744, + 745, + 746, + 749, + 750, + 751, + 752, + 753, + 754, + 755, + 758, + 759, + 760, + 761, + 762, + 763, + 764, + 765, + 766, + 769, + 770, + 771, + 772, + 773, + 774, + 777, + 778, + 779, + 780, + 781, + 782, + 783, + 786, + 787, + 788, + 789, + 790, + 791, + 792, + 793, + 794, + 797, + 798, + 799, + 800, + 801, + 802, + 805, + 806, + 807, + 808, + 809, + 810, + 811, + 814, + 815, + 816, + 817, + 818, + 819, 
+ 820, + 821, + 822, + 825, + 826, + 827, + 828, + 829, + 830, + 833, + 834, + 835, + 836, + 837, + 838, + 839, + 842, + 843, + 844, + 845, + 846, + 847, + 848, + 849, + 850, + 853, + 854, + 855, + 856, + 857, + 858, + 861, + 862, + 863, + 864, + 865, + 866, + 867, + 870, + 871, + 872, + 873, + 874, + 875, + 876, + 877, + 878, + 881, + 882, + 883, + 884, + 885, + 886, + 889, + 890, + 891, + 892, + 893, + 894, + 895, + 898, + 899, + 900, + 901, + 902, + 903, + 904, + 905, + 906, + 909, + 910, + 911, + 912, + 913, + 914, + 917, + 918, + 919, + 920, + 921, + 922, + 923, + 926, + 927, + 928, + 929, + 930, + 931, + 932, + 933, + 934, + 937, + 938, + 939, + 940, + 941, + 942, + 945, + 946, + 947, + 948, + 949, + 950, + 951, + 954, + 955, + 956, + 957, + 958, + 959, + 960, + 961, + 962, + 965, + 966, + 967, + 968, + 969, + 970, + 973, + 974, + 975, + 976, + 977, + 978, + 979, + 982, + 983, + 984, + 985, + 986, + 987, + 988, + 989, + 990, + 993, + 994, + 995, + 996, + 997, + 998, + 1001, + 1002, + 1003, + 1004, + 1005, + 1006, + 1007, + 1010, + 1011, + 1012, + 1013, + 1014, + 1015, + 1016, + 1017, + 1018, + 1021, + 1022, + 1023, + 1024, + 1025, + 1026, + 1029, + 1030, + 1031, + 1032, + 1033, + 1034, + 1035, + 1038, + 1039, + 1040, + 1041, + 1042, + 1043, + 1044, + 1045, + 1046, + 1049, + 1050, + 1051, + 1052, + 1053, + 1054, + 1057, + 1058, + 1059, + 1060, + 1061, + 1062, + 1063, + 1066, + 1067, + 1068, + 1069, + 1070, + 1071, + 1072, + 1073, + 1074, + 1077, + 1078, + 1079, + 1080, + 1081, + 1082, + 1085, + 1086, + 1087, + 1088, + 1089, + 1090, + 1091, + 1094, + 1095, + 1096, + 1097, + 1098, + 1099, + 1100, + 1101, + 1102, + 1105, + 1106, + 1107, + 1108, + 1109, + 1110, + 1113, + 1114, + 1115, + 1116, + 1117, + 1118, + 1119, + 1122, + 1123, + 1124, + 1125, + 1126, + 1127, + 1128, + 1129, + 1130, + 1133, + 1134, + 1135, + 1136, + 1137, + 1138, + 1141, + 1142, + 1143, + 1144, + 1145, + 1146, + 1147, + 1150, + 1151, + 1152, + 1153, + 1154, + 1155, + 1156, + 1157, + 1158, + 
1161, + 1162, + 1163, + 1164, + 1165, + 1166, + 1169, + 1170, + 1171, + 1172, + 1173, + 1174, + 1175, + 1178, + 1179, + 1180, + 1181, + 1182, + 1183, + 1184, + 1185, + 1186, + 1189, + 1190, + 1191, + 1192, + 1193, + 1194, + 1197, + 1198, + 1199, + 1200, + 1201, + 1202, + 1203, + 1206, + 1207, + 1208, + 1209, + 1210, + 1211, + 1212, + 1213, + 1214, + 1217, + 1218, + 1219, + 1220, + 1221, + 1222, + 1225, + 1226, + 1227, + 1228, + 1229, + 1230, + 1231, + 1234, + 1235, + 1236, + 1237, + 1238, + 1239, + 1240, + 1241, + 1242, + 1245, + 1246, + 1247, + 1248, + 1249, + 1250, + 1253, + 1254, + 1255, + 1256, + 1257, + 1258, + 1259, + 1262, + 1263, + 1264, + 1265, + 1266, + 1267, + 1268, + 1269, + 1270, + 1273, + 1274, + 1275, + 1276, + 1277, + 1278, + 1281, + 1282, + 1283, + 1284, + 1285, + 1286, + 1287, + 1290, + 1291, + 1292, + 1293, + 1294, + 1295, + 1296, + 1297, + 1298, + 1301, + 1302, + 1303, + 1304, + 1305, + 1306, + 1309, + 1310, + 1311, + 1312, + 1313, + 1314, + 1315, + 1318, + 1319, + 1320, + 1321, + 1322, + 1323, + 1324, + 1325, + 1326, + 1329, + 1330, + 1331, + 1332, + 1333, + 1334, + 1337, + 1338, + 1339, + 1340, + 1341, + 1342, + 1343, + 1346, + 1347, + 1348, + 1349, + 1350, + 1351, + 1352, + 1353, + 1354, + 1357, + 1358, + 1359, + 1360, + 1361, + 1362, + 1363, + 1366, + 1367, + 1368, + 1369, + 1370, + 1371, + 1374, + 1375, + 1376, + 1377, + 1378, + 1379, + 1380, + 1383, + 1384, + 1385, + 1386, + 1387, + 1388, + 1389, + 1390, + 1391, + 1394, + 1395, + 1396, + 1397, + 1398, + 1399, + 1402, + 1403, + 1404, + 1405, + 1406, + 1407, + 1408, + 1411, + 1412, + 1413, + 1414, + 1415, + 1416, + 1417, + 1418, + 1419, + 1422, + 1423, + 1424, + 1425, + 1426, + 1427, + 1430, + 1432, + 1433, + 1434, + 1435, + 1436, + 1437, + 1438, + 1439, + 1442 + ], + "heads": [[1132, 0, 0]], + "attrs": {"mxnet_version": ["int", 10000]} +} \ No newline at end of file diff --git a/insightface/models/buffalo_l/1k3d68.onnx b/insightface/models/buffalo_l/1k3d68.onnx new file mode 100644 index 
0000000000000000000000000000000000000000..221aa2f02a6faccddb2723529e1f93c7db2edbdc --- /dev/null +++ b/insightface/models/buffalo_l/1k3d68.onnx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df5c06b8a0c12e422b2ed8947b8869faa4105387f199c477af038aa01f9a45cc +size 143607619 diff --git a/insightface/models/buffalo_l/2d106det.onnx b/insightface/models/buffalo_l/2d106det.onnx new file mode 100644 index 0000000000000000000000000000000000000000..cdb163d88b5f51396855ebc795e0114322c98b6b --- /dev/null +++ b/insightface/models/buffalo_l/2d106det.onnx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f001b856447c413801ef5c42091ed0cd516fcd21f2d6b79635b1e733a7109dbf +size 5030888 diff --git a/insightface/models/buffalo_l/det_10g.onnx b/insightface/models/buffalo_l/det_10g.onnx new file mode 100644 index 0000000000000000000000000000000000000000..aa586e034379fa5ea5babc8aa73d47afcd0fa6c2 --- /dev/null +++ b/insightface/models/buffalo_l/det_10g.onnx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5838f7fe053675b1c7a08b633df49e7af5495cee0493c7dcf6697200b85b5b91 +size 16923827 diff --git a/insightface/models/buffalo_l/genderage.onnx b/insightface/models/buffalo_l/genderage.onnx new file mode 100644 index 0000000000000000000000000000000000000000..fcf638481cea978e99ddabd914ccd3b70c8401cb --- /dev/null +++ b/insightface/models/buffalo_l/genderage.onnx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4fde69b1c810857b88c64a335084f1c3fe8f01246c9a191b48c7bb756d6652fb +size 1322532 diff --git a/insightface/models/buffalo_l/w600k_r50.onnx b/insightface/models/buffalo_l/w600k_r50.onnx new file mode 100644 index 0000000000000000000000000000000000000000..571d2bb9ffd76399b23260620b9101b20bcc4e99 --- /dev/null +++ b/insightface/models/buffalo_l/w600k_r50.onnx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c06341c33c2ca1f86781dab0e829f88ad5b64be9fba56e56bc9ebdefc619e43 +size 
174383860 diff --git a/insightface/models/genderage_v1/model-0000.params b/insightface/models/genderage_v1/model-0000.params new file mode 100644 index 0000000000000000000000000000000000000000..c18e6e1cfaa9bf61eebf9760efefdc8c923454a4 --- /dev/null +++ b/insightface/models/genderage_v1/model-0000.params @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01788b7eaa2516636cbd976fad7883164aaeb0bd4027e878ff457f79fe9021aa +size 1100856 diff --git a/insightface/models/genderage_v1/model-symbol.json b/insightface/models/genderage_v1/model-symbol.json new file mode 100644 index 0000000000000000000000000000000000000000..723235b1c97f55fa2ac6907487321a90b5bee895 --- /dev/null +++ b/insightface/models/genderage_v1/model-symbol.json @@ -0,0 +1,2399 @@ +{ + "nodes": [ + { + "op": "null", + "name": "data", + "inputs": [] + }, + { + "op": "_minus_scalar", + "name": "_minusscalar0", + "attrs": {"scalar": "127.5"}, + "inputs": [[0, 0, 0]] + }, + { + "op": "_mul_scalar", + "name": "_mulscalar0", + "attrs": {"scalar": "0.0078125"}, + "inputs": [[1, 0, 0]] + }, + { + "op": "null", + "name": "conv_1_conv2d_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "8", + "num_group": "1", + "pad": "(1, 1)", + "stride": "(1, 1)" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "conv_1_conv2d", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "8", + "num_group": "1", + "pad": "(1, 1)", + "stride": "(1, 1)" + }, + "inputs": [[2, 0, 0], [3, 0, 0]] + }, + { + "op": "null", + "name": "conv_1_batchnorm_gamma", + "attrs": {"fix_gamma": "True"}, + "inputs": [] + }, + { + "op": "null", + "name": "conv_1_batchnorm_beta", + "attrs": {"fix_gamma": "True"}, + "inputs": [] + }, + { + "op": "null", + "name": "conv_1_batchnorm_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "fix_gamma": "True" + }, + "inputs": [] + }, + { + "op": "null", + "name": "conv_1_batchnorm_moving_var", + "attrs": { + "__init__": 
"[\"one\", {}]", + "fix_gamma": "True" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "conv_1_batchnorm", + "attrs": {"fix_gamma": "True"}, + "inputs": [[4, 0, 0], [5, 0, 0], [6, 0, 0], [7, 0, 1], [8, 0, 1]] + }, + { + "op": "Activation", + "name": "conv_1_relu", + "attrs": {"act_type": "relu"}, + "inputs": [[9, 0, 0]] + }, + { + "op": "null", + "name": "conv_2_dw_conv2d_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "8", + "num_group": "8", + "pad": "(1, 1)", + "stride": "(1, 1)" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "conv_2_dw_conv2d", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "8", + "num_group": "8", + "pad": "(1, 1)", + "stride": "(1, 1)" + }, + "inputs": [[10, 0, 0], [11, 0, 0]] + }, + { + "op": "null", + "name": "conv_2_dw_batchnorm_gamma", + "attrs": {"fix_gamma": "True"}, + "inputs": [] + }, + { + "op": "null", + "name": "conv_2_dw_batchnorm_beta", + "attrs": {"fix_gamma": "True"}, + "inputs": [] + }, + { + "op": "null", + "name": "conv_2_dw_batchnorm_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "fix_gamma": "True" + }, + "inputs": [] + }, + { + "op": "null", + "name": "conv_2_dw_batchnorm_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "fix_gamma": "True" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "conv_2_dw_batchnorm", + "attrs": {"fix_gamma": "True"}, + "inputs": [[12, 0, 0], [13, 0, 0], [14, 0, 0], [15, 0, 1], [16, 0, 1]] + }, + { + "op": "Activation", + "name": "conv_2_dw_relu", + "attrs": {"act_type": "relu"}, + "inputs": [[17, 0, 0]] + }, + { + "op": "null", + "name": "conv_2_conv2d_weight", + "attrs": { + "kernel": "(1, 1)", + "no_bias": "True", + "num_filter": "16", + "num_group": "1", + "pad": "(0, 0)", + "stride": "(1, 1)" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "conv_2_conv2d", + "attrs": { + "kernel": "(1, 1)", + "no_bias": "True", + "num_filter": "16", + "num_group": "1", + 
"pad": "(0, 0)", + "stride": "(1, 1)" + }, + "inputs": [[18, 0, 0], [19, 0, 0]] + }, + { + "op": "null", + "name": "conv_2_batchnorm_gamma", + "attrs": {"fix_gamma": "True"}, + "inputs": [] + }, + { + "op": "null", + "name": "conv_2_batchnorm_beta", + "attrs": {"fix_gamma": "True"}, + "inputs": [] + }, + { + "op": "null", + "name": "conv_2_batchnorm_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "fix_gamma": "True" + }, + "inputs": [] + }, + { + "op": "null", + "name": "conv_2_batchnorm_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "fix_gamma": "True" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "conv_2_batchnorm", + "attrs": {"fix_gamma": "True"}, + "inputs": [[20, 0, 0], [21, 0, 0], [22, 0, 0], [23, 0, 1], [24, 0, 1]] + }, + { + "op": "Activation", + "name": "conv_2_relu", + "attrs": {"act_type": "relu"}, + "inputs": [[25, 0, 0]] + }, + { + "op": "null", + "name": "conv_3_dw_conv2d_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "16", + "num_group": "16", + "pad": "(1, 1)", + "stride": "(2, 2)" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "conv_3_dw_conv2d", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "16", + "num_group": "16", + "pad": "(1, 1)", + "stride": "(2, 2)" + }, + "inputs": [[26, 0, 0], [27, 0, 0]] + }, + { + "op": "null", + "name": "conv_3_dw_batchnorm_gamma", + "attrs": {"fix_gamma": "True"}, + "inputs": [] + }, + { + "op": "null", + "name": "conv_3_dw_batchnorm_beta", + "attrs": {"fix_gamma": "True"}, + "inputs": [] + }, + { + "op": "null", + "name": "conv_3_dw_batchnorm_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "fix_gamma": "True" + }, + "inputs": [] + }, + { + "op": "null", + "name": "conv_3_dw_batchnorm_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "fix_gamma": "True" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "conv_3_dw_batchnorm", + "attrs": {"fix_gamma": "True"}, + "inputs": 
[[28, 0, 0], [29, 0, 0], [30, 0, 0], [31, 0, 1], [32, 0, 1]] + }, + { + "op": "Activation", + "name": "conv_3_dw_relu", + "attrs": {"act_type": "relu"}, + "inputs": [[33, 0, 0]] + }, + { + "op": "null", + "name": "conv_3_conv2d_weight", + "attrs": { + "kernel": "(1, 1)", + "no_bias": "True", + "num_filter": "32", + "num_group": "1", + "pad": "(0, 0)", + "stride": "(1, 1)" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "conv_3_conv2d", + "attrs": { + "kernel": "(1, 1)", + "no_bias": "True", + "num_filter": "32", + "num_group": "1", + "pad": "(0, 0)", + "stride": "(1, 1)" + }, + "inputs": [[34, 0, 0], [35, 0, 0]] + }, + { + "op": "null", + "name": "conv_3_batchnorm_gamma", + "attrs": {"fix_gamma": "True"}, + "inputs": [] + }, + { + "op": "null", + "name": "conv_3_batchnorm_beta", + "attrs": {"fix_gamma": "True"}, + "inputs": [] + }, + { + "op": "null", + "name": "conv_3_batchnorm_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "fix_gamma": "True" + }, + "inputs": [] + }, + { + "op": "null", + "name": "conv_3_batchnorm_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "fix_gamma": "True" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "conv_3_batchnorm", + "attrs": {"fix_gamma": "True"}, + "inputs": [[36, 0, 0], [37, 0, 0], [38, 0, 0], [39, 0, 1], [40, 0, 1]] + }, + { + "op": "Activation", + "name": "conv_3_relu", + "attrs": {"act_type": "relu"}, + "inputs": [[41, 0, 0]] + }, + { + "op": "null", + "name": "conv_4_dw_conv2d_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "32", + "num_group": "32", + "pad": "(1, 1)", + "stride": "(1, 1)" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "conv_4_dw_conv2d", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "32", + "num_group": "32", + "pad": "(1, 1)", + "stride": "(1, 1)" + }, + "inputs": [[42, 0, 0], [43, 0, 0]] + }, + { + "op": "null", + "name": "conv_4_dw_batchnorm_gamma", + "attrs": {"fix_gamma": 
"True"}, + "inputs": [] + }, + { + "op": "null", + "name": "conv_4_dw_batchnorm_beta", + "attrs": {"fix_gamma": "True"}, + "inputs": [] + }, + { + "op": "null", + "name": "conv_4_dw_batchnorm_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "fix_gamma": "True" + }, + "inputs": [] + }, + { + "op": "null", + "name": "conv_4_dw_batchnorm_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "fix_gamma": "True" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "conv_4_dw_batchnorm", + "attrs": {"fix_gamma": "True"}, + "inputs": [[44, 0, 0], [45, 0, 0], [46, 0, 0], [47, 0, 1], [48, 0, 1]] + }, + { + "op": "Activation", + "name": "conv_4_dw_relu", + "attrs": {"act_type": "relu"}, + "inputs": [[49, 0, 0]] + }, + { + "op": "null", + "name": "conv_4_conv2d_weight", + "attrs": { + "kernel": "(1, 1)", + "no_bias": "True", + "num_filter": "32", + "num_group": "1", + "pad": "(0, 0)", + "stride": "(1, 1)" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "conv_4_conv2d", + "attrs": { + "kernel": "(1, 1)", + "no_bias": "True", + "num_filter": "32", + "num_group": "1", + "pad": "(0, 0)", + "stride": "(1, 1)" + }, + "inputs": [[50, 0, 0], [51, 0, 0]] + }, + { + "op": "null", + "name": "conv_4_batchnorm_gamma", + "attrs": {"fix_gamma": "True"}, + "inputs": [] + }, + { + "op": "null", + "name": "conv_4_batchnorm_beta", + "attrs": {"fix_gamma": "True"}, + "inputs": [] + }, + { + "op": "null", + "name": "conv_4_batchnorm_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "fix_gamma": "True" + }, + "inputs": [] + }, + { + "op": "null", + "name": "conv_4_batchnorm_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "fix_gamma": "True" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "conv_4_batchnorm", + "attrs": {"fix_gamma": "True"}, + "inputs": [[52, 0, 0], [53, 0, 0], [54, 0, 0], [55, 0, 1], [56, 0, 1]] + }, + { + "op": "Activation", + "name": "conv_4_relu", + "attrs": {"act_type": "relu"}, + "inputs": [[57, 0, 0]] + 
}, + { + "op": "null", + "name": "conv_5_dw_conv2d_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "32", + "num_group": "32", + "pad": "(1, 1)", + "stride": "(2, 2)" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "conv_5_dw_conv2d", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "32", + "num_group": "32", + "pad": "(1, 1)", + "stride": "(2, 2)" + }, + "inputs": [[58, 0, 0], [59, 0, 0]] + }, + { + "op": "null", + "name": "conv_5_dw_batchnorm_gamma", + "attrs": {"fix_gamma": "True"}, + "inputs": [] + }, + { + "op": "null", + "name": "conv_5_dw_batchnorm_beta", + "attrs": {"fix_gamma": "True"}, + "inputs": [] + }, + { + "op": "null", + "name": "conv_5_dw_batchnorm_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "fix_gamma": "True" + }, + "inputs": [] + }, + { + "op": "null", + "name": "conv_5_dw_batchnorm_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "fix_gamma": "True" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "conv_5_dw_batchnorm", + "attrs": {"fix_gamma": "True"}, + "inputs": [[60, 0, 0], [61, 0, 0], [62, 0, 0], [63, 0, 1], [64, 0, 1]] + }, + { + "op": "Activation", + "name": "conv_5_dw_relu", + "attrs": {"act_type": "relu"}, + "inputs": [[65, 0, 0]] + }, + { + "op": "null", + "name": "conv_5_conv2d_weight", + "attrs": { + "kernel": "(1, 1)", + "no_bias": "True", + "num_filter": "64", + "num_group": "1", + "pad": "(0, 0)", + "stride": "(1, 1)" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "conv_5_conv2d", + "attrs": { + "kernel": "(1, 1)", + "no_bias": "True", + "num_filter": "64", + "num_group": "1", + "pad": "(0, 0)", + "stride": "(1, 1)" + }, + "inputs": [[66, 0, 0], [67, 0, 0]] + }, + { + "op": "null", + "name": "conv_5_batchnorm_gamma", + "attrs": {"fix_gamma": "True"}, + "inputs": [] + }, + { + "op": "null", + "name": "conv_5_batchnorm_beta", + "attrs": {"fix_gamma": "True"}, + "inputs": [] + }, + { + "op": "null", + "name": 
"conv_5_batchnorm_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "fix_gamma": "True" + }, + "inputs": [] + }, + { + "op": "null", + "name": "conv_5_batchnorm_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "fix_gamma": "True" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "conv_5_batchnorm", + "attrs": {"fix_gamma": "True"}, + "inputs": [[68, 0, 0], [69, 0, 0], [70, 0, 0], [71, 0, 1], [72, 0, 1]] + }, + { + "op": "Activation", + "name": "conv_5_relu", + "attrs": {"act_type": "relu"}, + "inputs": [[73, 0, 0]] + }, + { + "op": "null", + "name": "conv_6_dw_conv2d_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "64", + "num_group": "64", + "pad": "(1, 1)", + "stride": "(1, 1)" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "conv_6_dw_conv2d", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "64", + "num_group": "64", + "pad": "(1, 1)", + "stride": "(1, 1)" + }, + "inputs": [[74, 0, 0], [75, 0, 0]] + }, + { + "op": "null", + "name": "conv_6_dw_batchnorm_gamma", + "attrs": {"fix_gamma": "True"}, + "inputs": [] + }, + { + "op": "null", + "name": "conv_6_dw_batchnorm_beta", + "attrs": {"fix_gamma": "True"}, + "inputs": [] + }, + { + "op": "null", + "name": "conv_6_dw_batchnorm_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "fix_gamma": "True" + }, + "inputs": [] + }, + { + "op": "null", + "name": "conv_6_dw_batchnorm_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "fix_gamma": "True" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "conv_6_dw_batchnorm", + "attrs": {"fix_gamma": "True"}, + "inputs": [[76, 0, 0], [77, 0, 0], [78, 0, 0], [79, 0, 1], [80, 0, 1]] + }, + { + "op": "Activation", + "name": "conv_6_dw_relu", + "attrs": {"act_type": "relu"}, + "inputs": [[81, 0, 0]] + }, + { + "op": "null", + "name": "conv_6_conv2d_weight", + "attrs": { + "kernel": "(1, 1)", + "no_bias": "True", + "num_filter": "64", + "num_group": "1", + 
"pad": "(0, 0)", + "stride": "(1, 1)" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "conv_6_conv2d", + "attrs": { + "kernel": "(1, 1)", + "no_bias": "True", + "num_filter": "64", + "num_group": "1", + "pad": "(0, 0)", + "stride": "(1, 1)" + }, + "inputs": [[82, 0, 0], [83, 0, 0]] + }, + { + "op": "null", + "name": "conv_6_batchnorm_gamma", + "attrs": {"fix_gamma": "True"}, + "inputs": [] + }, + { + "op": "null", + "name": "conv_6_batchnorm_beta", + "attrs": {"fix_gamma": "True"}, + "inputs": [] + }, + { + "op": "null", + "name": "conv_6_batchnorm_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "fix_gamma": "True" + }, + "inputs": [] + }, + { + "op": "null", + "name": "conv_6_batchnorm_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "fix_gamma": "True" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "conv_6_batchnorm", + "attrs": {"fix_gamma": "True"}, + "inputs": [[84, 0, 0], [85, 0, 0], [86, 0, 0], [87, 0, 1], [88, 0, 1]] + }, + { + "op": "Activation", + "name": "conv_6_relu", + "attrs": {"act_type": "relu"}, + "inputs": [[89, 0, 0]] + }, + { + "op": "null", + "name": "conv_7_dw_conv2d_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "64", + "num_group": "64", + "pad": "(1, 1)", + "stride": "(2, 2)" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "conv_7_dw_conv2d", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "64", + "num_group": "64", + "pad": "(1, 1)", + "stride": "(2, 2)" + }, + "inputs": [[90, 0, 0], [91, 0, 0]] + }, + { + "op": "null", + "name": "conv_7_dw_batchnorm_gamma", + "attrs": {"fix_gamma": "True"}, + "inputs": [] + }, + { + "op": "null", + "name": "conv_7_dw_batchnorm_beta", + "attrs": {"fix_gamma": "True"}, + "inputs": [] + }, + { + "op": "null", + "name": "conv_7_dw_batchnorm_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "fix_gamma": "True" + }, + "inputs": [] + }, + { + "op": "null", + "name": 
"conv_7_dw_batchnorm_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "fix_gamma": "True" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "conv_7_dw_batchnorm", + "attrs": {"fix_gamma": "True"}, + "inputs": [[92, 0, 0], [93, 0, 0], [94, 0, 0], [95, 0, 1], [96, 0, 1]] + }, + { + "op": "Activation", + "name": "conv_7_dw_relu", + "attrs": {"act_type": "relu"}, + "inputs": [[97, 0, 0]] + }, + { + "op": "null", + "name": "conv_7_conv2d_weight", + "attrs": { + "kernel": "(1, 1)", + "no_bias": "True", + "num_filter": "128", + "num_group": "1", + "pad": "(0, 0)", + "stride": "(1, 1)" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "conv_7_conv2d", + "attrs": { + "kernel": "(1, 1)", + "no_bias": "True", + "num_filter": "128", + "num_group": "1", + "pad": "(0, 0)", + "stride": "(1, 1)" + }, + "inputs": [[98, 0, 0], [99, 0, 0]] + }, + { + "op": "null", + "name": "conv_7_batchnorm_gamma", + "attrs": {"fix_gamma": "True"}, + "inputs": [] + }, + { + "op": "null", + "name": "conv_7_batchnorm_beta", + "attrs": {"fix_gamma": "True"}, + "inputs": [] + }, + { + "op": "null", + "name": "conv_7_batchnorm_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "fix_gamma": "True" + }, + "inputs": [] + }, + { + "op": "null", + "name": "conv_7_batchnorm_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "fix_gamma": "True" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "conv_7_batchnorm", + "attrs": {"fix_gamma": "True"}, + "inputs": [[100, 0, 0], [101, 0, 0], [102, 0, 0], [103, 0, 1], [104, 0, 1]] + }, + { + "op": "Activation", + "name": "conv_7_relu", + "attrs": {"act_type": "relu"}, + "inputs": [[105, 0, 0]] + }, + { + "op": "null", + "name": "conv_8_dw_conv2d_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "128", + "num_group": "128", + "pad": "(1, 1)", + "stride": "(1, 1)" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "conv_8_dw_conv2d", + "attrs": { + "kernel": "(3, 3)", + 
"no_bias": "True", + "num_filter": "128", + "num_group": "128", + "pad": "(1, 1)", + "stride": "(1, 1)" + }, + "inputs": [[106, 0, 0], [107, 0, 0]] + }, + { + "op": "null", + "name": "conv_8_dw_batchnorm_gamma", + "attrs": {"fix_gamma": "True"}, + "inputs": [] + }, + { + "op": "null", + "name": "conv_8_dw_batchnorm_beta", + "attrs": {"fix_gamma": "True"}, + "inputs": [] + }, + { + "op": "null", + "name": "conv_8_dw_batchnorm_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "fix_gamma": "True" + }, + "inputs": [] + }, + { + "op": "null", + "name": "conv_8_dw_batchnorm_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "fix_gamma": "True" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "conv_8_dw_batchnorm", + "attrs": {"fix_gamma": "True"}, + "inputs": [[108, 0, 0], [109, 0, 0], [110, 0, 0], [111, 0, 1], [112, 0, 1]] + }, + { + "op": "Activation", + "name": "conv_8_dw_relu", + "attrs": {"act_type": "relu"}, + "inputs": [[113, 0, 0]] + }, + { + "op": "null", + "name": "conv_8_conv2d_weight", + "attrs": { + "kernel": "(1, 1)", + "no_bias": "True", + "num_filter": "128", + "num_group": "1", + "pad": "(0, 0)", + "stride": "(1, 1)" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "conv_8_conv2d", + "attrs": { + "kernel": "(1, 1)", + "no_bias": "True", + "num_filter": "128", + "num_group": "1", + "pad": "(0, 0)", + "stride": "(1, 1)" + }, + "inputs": [[114, 0, 0], [115, 0, 0]] + }, + { + "op": "null", + "name": "conv_8_batchnorm_gamma", + "attrs": {"fix_gamma": "True"}, + "inputs": [] + }, + { + "op": "null", + "name": "conv_8_batchnorm_beta", + "attrs": {"fix_gamma": "True"}, + "inputs": [] + }, + { + "op": "null", + "name": "conv_8_batchnorm_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "fix_gamma": "True" + }, + "inputs": [] + }, + { + "op": "null", + "name": "conv_8_batchnorm_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "fix_gamma": "True" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + 
"name": "conv_8_batchnorm", + "attrs": {"fix_gamma": "True"}, + "inputs": [[116, 0, 0], [117, 0, 0], [118, 0, 0], [119, 0, 1], [120, 0, 1]] + }, + { + "op": "Activation", + "name": "conv_8_relu", + "attrs": {"act_type": "relu"}, + "inputs": [[121, 0, 0]] + }, + { + "op": "null", + "name": "conv_9_dw_conv2d_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "128", + "num_group": "128", + "pad": "(1, 1)", + "stride": "(1, 1)" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "conv_9_dw_conv2d", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "128", + "num_group": "128", + "pad": "(1, 1)", + "stride": "(1, 1)" + }, + "inputs": [[122, 0, 0], [123, 0, 0]] + }, + { + "op": "null", + "name": "conv_9_dw_batchnorm_gamma", + "attrs": {"fix_gamma": "True"}, + "inputs": [] + }, + { + "op": "null", + "name": "conv_9_dw_batchnorm_beta", + "attrs": {"fix_gamma": "True"}, + "inputs": [] + }, + { + "op": "null", + "name": "conv_9_dw_batchnorm_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "fix_gamma": "True" + }, + "inputs": [] + }, + { + "op": "null", + "name": "conv_9_dw_batchnorm_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "fix_gamma": "True" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "conv_9_dw_batchnorm", + "attrs": {"fix_gamma": "True"}, + "inputs": [[124, 0, 0], [125, 0, 0], [126, 0, 0], [127, 0, 1], [128, 0, 1]] + }, + { + "op": "Activation", + "name": "conv_9_dw_relu", + "attrs": {"act_type": "relu"}, + "inputs": [[129, 0, 0]] + }, + { + "op": "null", + "name": "conv_9_conv2d_weight", + "attrs": { + "kernel": "(1, 1)", + "no_bias": "True", + "num_filter": "128", + "num_group": "1", + "pad": "(0, 0)", + "stride": "(1, 1)" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "conv_9_conv2d", + "attrs": { + "kernel": "(1, 1)", + "no_bias": "True", + "num_filter": "128", + "num_group": "1", + "pad": "(0, 0)", + "stride": "(1, 1)" + }, + "inputs": [[130, 0, 
0], [131, 0, 0]] + }, + { + "op": "null", + "name": "conv_9_batchnorm_gamma", + "attrs": {"fix_gamma": "True"}, + "inputs": [] + }, + { + "op": "null", + "name": "conv_9_batchnorm_beta", + "attrs": {"fix_gamma": "True"}, + "inputs": [] + }, + { + "op": "null", + "name": "conv_9_batchnorm_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "fix_gamma": "True" + }, + "inputs": [] + }, + { + "op": "null", + "name": "conv_9_batchnorm_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "fix_gamma": "True" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "conv_9_batchnorm", + "attrs": {"fix_gamma": "True"}, + "inputs": [[132, 0, 0], [133, 0, 0], [134, 0, 0], [135, 0, 1], [136, 0, 1]] + }, + { + "op": "Activation", + "name": "conv_9_relu", + "attrs": {"act_type": "relu"}, + "inputs": [[137, 0, 0]] + }, + { + "op": "null", + "name": "conv_10_dw_conv2d_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "128", + "num_group": "128", + "pad": "(1, 1)", + "stride": "(1, 1)" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "conv_10_dw_conv2d", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "128", + "num_group": "128", + "pad": "(1, 1)", + "stride": "(1, 1)" + }, + "inputs": [[138, 0, 0], [139, 0, 0]] + }, + { + "op": "null", + "name": "conv_10_dw_batchnorm_gamma", + "attrs": {"fix_gamma": "True"}, + "inputs": [] + }, + { + "op": "null", + "name": "conv_10_dw_batchnorm_beta", + "attrs": {"fix_gamma": "True"}, + "inputs": [] + }, + { + "op": "null", + "name": "conv_10_dw_batchnorm_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "fix_gamma": "True" + }, + "inputs": [] + }, + { + "op": "null", + "name": "conv_10_dw_batchnorm_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "fix_gamma": "True" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "conv_10_dw_batchnorm", + "attrs": {"fix_gamma": "True"}, + "inputs": [[140, 0, 0], [141, 0, 0], [142, 0, 0], [143, 
0, 1], [144, 0, 1]] + }, + { + "op": "Activation", + "name": "conv_10_dw_relu", + "attrs": {"act_type": "relu"}, + "inputs": [[145, 0, 0]] + }, + { + "op": "null", + "name": "conv_10_conv2d_weight", + "attrs": { + "kernel": "(1, 1)", + "no_bias": "True", + "num_filter": "128", + "num_group": "1", + "pad": "(0, 0)", + "stride": "(1, 1)" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "conv_10_conv2d", + "attrs": { + "kernel": "(1, 1)", + "no_bias": "True", + "num_filter": "128", + "num_group": "1", + "pad": "(0, 0)", + "stride": "(1, 1)" + }, + "inputs": [[146, 0, 0], [147, 0, 0]] + }, + { + "op": "null", + "name": "conv_10_batchnorm_gamma", + "attrs": {"fix_gamma": "True"}, + "inputs": [] + }, + { + "op": "null", + "name": "conv_10_batchnorm_beta", + "attrs": {"fix_gamma": "True"}, + "inputs": [] + }, + { + "op": "null", + "name": "conv_10_batchnorm_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "fix_gamma": "True" + }, + "inputs": [] + }, + { + "op": "null", + "name": "conv_10_batchnorm_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "fix_gamma": "True" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "conv_10_batchnorm", + "attrs": {"fix_gamma": "True"}, + "inputs": [[148, 0, 0], [149, 0, 0], [150, 0, 0], [151, 0, 1], [152, 0, 1]] + }, + { + "op": "Activation", + "name": "conv_10_relu", + "attrs": {"act_type": "relu"}, + "inputs": [[153, 0, 0]] + }, + { + "op": "null", + "name": "conv_11_dw_conv2d_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "128", + "num_group": "128", + "pad": "(1, 1)", + "stride": "(1, 1)" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "conv_11_dw_conv2d", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "128", + "num_group": "128", + "pad": "(1, 1)", + "stride": "(1, 1)" + }, + "inputs": [[154, 0, 0], [155, 0, 0]] + }, + { + "op": "null", + "name": "conv_11_dw_batchnorm_gamma", + "attrs": {"fix_gamma": "True"}, + 
"inputs": [] + }, + { + "op": "null", + "name": "conv_11_dw_batchnorm_beta", + "attrs": {"fix_gamma": "True"}, + "inputs": [] + }, + { + "op": "null", + "name": "conv_11_dw_batchnorm_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "fix_gamma": "True" + }, + "inputs": [] + }, + { + "op": "null", + "name": "conv_11_dw_batchnorm_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "fix_gamma": "True" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "conv_11_dw_batchnorm", + "attrs": {"fix_gamma": "True"}, + "inputs": [[156, 0, 0], [157, 0, 0], [158, 0, 0], [159, 0, 1], [160, 0, 1]] + }, + { + "op": "Activation", + "name": "conv_11_dw_relu", + "attrs": {"act_type": "relu"}, + "inputs": [[161, 0, 0]] + }, + { + "op": "null", + "name": "conv_11_conv2d_weight", + "attrs": { + "kernel": "(1, 1)", + "no_bias": "True", + "num_filter": "128", + "num_group": "1", + "pad": "(0, 0)", + "stride": "(1, 1)" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "conv_11_conv2d", + "attrs": { + "kernel": "(1, 1)", + "no_bias": "True", + "num_filter": "128", + "num_group": "1", + "pad": "(0, 0)", + "stride": "(1, 1)" + }, + "inputs": [[162, 0, 0], [163, 0, 0]] + }, + { + "op": "null", + "name": "conv_11_batchnorm_gamma", + "attrs": {"fix_gamma": "True"}, + "inputs": [] + }, + { + "op": "null", + "name": "conv_11_batchnorm_beta", + "attrs": {"fix_gamma": "True"}, + "inputs": [] + }, + { + "op": "null", + "name": "conv_11_batchnorm_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "fix_gamma": "True" + }, + "inputs": [] + }, + { + "op": "null", + "name": "conv_11_batchnorm_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "fix_gamma": "True" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "conv_11_batchnorm", + "attrs": {"fix_gamma": "True"}, + "inputs": [[164, 0, 0], [165, 0, 0], [166, 0, 0], [167, 0, 1], [168, 0, 1]] + }, + { + "op": "Activation", + "name": "conv_11_relu", + "attrs": {"act_type": "relu"}, + 
"inputs": [[169, 0, 0]] + }, + { + "op": "null", + "name": "conv_12_dw_conv2d_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "128", + "num_group": "128", + "pad": "(1, 1)", + "stride": "(1, 1)" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "conv_12_dw_conv2d", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "128", + "num_group": "128", + "pad": "(1, 1)", + "stride": "(1, 1)" + }, + "inputs": [[170, 0, 0], [171, 0, 0]] + }, + { + "op": "null", + "name": "conv_12_dw_batchnorm_gamma", + "attrs": {"fix_gamma": "True"}, + "inputs": [] + }, + { + "op": "null", + "name": "conv_12_dw_batchnorm_beta", + "attrs": {"fix_gamma": "True"}, + "inputs": [] + }, + { + "op": "null", + "name": "conv_12_dw_batchnorm_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "fix_gamma": "True" + }, + "inputs": [] + }, + { + "op": "null", + "name": "conv_12_dw_batchnorm_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "fix_gamma": "True" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "conv_12_dw_batchnorm", + "attrs": {"fix_gamma": "True"}, + "inputs": [[172, 0, 0], [173, 0, 0], [174, 0, 0], [175, 0, 1], [176, 0, 1]] + }, + { + "op": "Activation", + "name": "conv_12_dw_relu", + "attrs": {"act_type": "relu"}, + "inputs": [[177, 0, 0]] + }, + { + "op": "null", + "name": "conv_12_conv2d_weight", + "attrs": { + "kernel": "(1, 1)", + "no_bias": "True", + "num_filter": "128", + "num_group": "1", + "pad": "(0, 0)", + "stride": "(1, 1)" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "conv_12_conv2d", + "attrs": { + "kernel": "(1, 1)", + "no_bias": "True", + "num_filter": "128", + "num_group": "1", + "pad": "(0, 0)", + "stride": "(1, 1)" + }, + "inputs": [[178, 0, 0], [179, 0, 0]] + }, + { + "op": "null", + "name": "conv_12_batchnorm_gamma", + "attrs": {"fix_gamma": "True"}, + "inputs": [] + }, + { + "op": "null", + "name": "conv_12_batchnorm_beta", + "attrs": {"fix_gamma": "True"}, 
+ "inputs": [] + }, + { + "op": "null", + "name": "conv_12_batchnorm_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "fix_gamma": "True" + }, + "inputs": [] + }, + { + "op": "null", + "name": "conv_12_batchnorm_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "fix_gamma": "True" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "conv_12_batchnorm", + "attrs": {"fix_gamma": "True"}, + "inputs": [[180, 0, 0], [181, 0, 0], [182, 0, 0], [183, 0, 1], [184, 0, 1]] + }, + { + "op": "Activation", + "name": "conv_12_relu", + "attrs": {"act_type": "relu"}, + "inputs": [[185, 0, 0]] + }, + { + "op": "null", + "name": "conv_13_dw_conv2d_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "128", + "num_group": "128", + "pad": "(1, 1)", + "stride": "(2, 2)" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "conv_13_dw_conv2d", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "128", + "num_group": "128", + "pad": "(1, 1)", + "stride": "(2, 2)" + }, + "inputs": [[186, 0, 0], [187, 0, 0]] + }, + { + "op": "null", + "name": "conv_13_dw_batchnorm_gamma", + "attrs": {"fix_gamma": "True"}, + "inputs": [] + }, + { + "op": "null", + "name": "conv_13_dw_batchnorm_beta", + "attrs": {"fix_gamma": "True"}, + "inputs": [] + }, + { + "op": "null", + "name": "conv_13_dw_batchnorm_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "fix_gamma": "True" + }, + "inputs": [] + }, + { + "op": "null", + "name": "conv_13_dw_batchnorm_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "fix_gamma": "True" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "conv_13_dw_batchnorm", + "attrs": {"fix_gamma": "True"}, + "inputs": [[188, 0, 0], [189, 0, 0], [190, 0, 0], [191, 0, 1], [192, 0, 1]] + }, + { + "op": "Activation", + "name": "conv_13_dw_relu", + "attrs": {"act_type": "relu"}, + "inputs": [[193, 0, 0]] + }, + { + "op": "null", + "name": "conv_13_conv2d_weight", + "attrs": { + 
"kernel": "(1, 1)", + "no_bias": "True", + "num_filter": "256", + "num_group": "1", + "pad": "(0, 0)", + "stride": "(1, 1)" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "conv_13_conv2d", + "attrs": { + "kernel": "(1, 1)", + "no_bias": "True", + "num_filter": "256", + "num_group": "1", + "pad": "(0, 0)", + "stride": "(1, 1)" + }, + "inputs": [[194, 0, 0], [195, 0, 0]] + }, + { + "op": "null", + "name": "conv_13_batchnorm_gamma", + "attrs": {"fix_gamma": "True"}, + "inputs": [] + }, + { + "op": "null", + "name": "conv_13_batchnorm_beta", + "attrs": {"fix_gamma": "True"}, + "inputs": [] + }, + { + "op": "null", + "name": "conv_13_batchnorm_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "fix_gamma": "True" + }, + "inputs": [] + }, + { + "op": "null", + "name": "conv_13_batchnorm_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "fix_gamma": "True" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "conv_13_batchnorm", + "attrs": {"fix_gamma": "True"}, + "inputs": [[196, 0, 0], [197, 0, 0], [198, 0, 0], [199, 0, 1], [200, 0, 1]] + }, + { + "op": "Activation", + "name": "conv_13_relu", + "attrs": {"act_type": "relu"}, + "inputs": [[201, 0, 0]] + }, + { + "op": "null", + "name": "conv_14_dw_conv2d_weight", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "num_group": "256", + "pad": "(1, 1)", + "stride": "(1, 1)" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "conv_14_dw_conv2d", + "attrs": { + "kernel": "(3, 3)", + "no_bias": "True", + "num_filter": "256", + "num_group": "256", + "pad": "(1, 1)", + "stride": "(1, 1)" + }, + "inputs": [[202, 0, 0], [203, 0, 0]] + }, + { + "op": "null", + "name": "conv_14_dw_batchnorm_gamma", + "attrs": {"fix_gamma": "True"}, + "inputs": [] + }, + { + "op": "null", + "name": "conv_14_dw_batchnorm_beta", + "attrs": {"fix_gamma": "True"}, + "inputs": [] + }, + { + "op": "null", + "name": "conv_14_dw_batchnorm_moving_mean", + "attrs": { + 
"__init__": "[\"zero\", {}]", + "fix_gamma": "True" + }, + "inputs": [] + }, + { + "op": "null", + "name": "conv_14_dw_batchnorm_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "fix_gamma": "True" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "conv_14_dw_batchnorm", + "attrs": {"fix_gamma": "True"}, + "inputs": [[204, 0, 0], [205, 0, 0], [206, 0, 0], [207, 0, 1], [208, 0, 1]] + }, + { + "op": "Activation", + "name": "conv_14_dw_relu", + "attrs": {"act_type": "relu"}, + "inputs": [[209, 0, 0]] + }, + { + "op": "null", + "name": "conv_14_conv2d_weight", + "attrs": { + "kernel": "(1, 1)", + "no_bias": "True", + "num_filter": "256", + "num_group": "1", + "pad": "(0, 0)", + "stride": "(1, 1)" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "conv_14_conv2d", + "attrs": { + "kernel": "(1, 1)", + "no_bias": "True", + "num_filter": "256", + "num_group": "1", + "pad": "(0, 0)", + "stride": "(1, 1)" + }, + "inputs": [[210, 0, 0], [211, 0, 0]] + }, + { + "op": "null", + "name": "conv_14_batchnorm_gamma", + "attrs": {"fix_gamma": "True"}, + "inputs": [] + }, + { + "op": "null", + "name": "conv_14_batchnorm_beta", + "attrs": {"fix_gamma": "True"}, + "inputs": [] + }, + { + "op": "null", + "name": "conv_14_batchnorm_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "fix_gamma": "True" + }, + "inputs": [] + }, + { + "op": "null", + "name": "conv_14_batchnorm_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "fix_gamma": "True" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "conv_14_batchnorm", + "attrs": {"fix_gamma": "True"}, + "inputs": [[212, 0, 0], [213, 0, 0], [214, 0, 0], [215, 0, 1], [216, 0, 1]] + }, + { + "op": "Activation", + "name": "conv_14_relu", + "attrs": {"act_type": "relu"}, + "inputs": [[217, 0, 0]] + }, + { + "op": "null", + "name": "bn1_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "bn1_beta", + 
"attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "bn1_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "bn1_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "bn1", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[218, 0, 0], [219, 0, 0], [220, 0, 0], [221, 0, 1], [222, 0, 1]] + }, + { + "op": "null", + "name": "relu1_gamma", + "attrs": { + "__init__": "[\"Constant\", {\"value\": 0.25}]", + "act_type": "prelu" + }, + "inputs": [] + }, + { + "op": "LeakyReLU", + "name": "relu1", + "attrs": {"act_type": "prelu"}, + "inputs": [[223, 0, 0], [224, 0, 0]] + }, + { + "op": "Pooling", + "name": "pool1", + "attrs": { + "global_pool": "True", + "kernel": "(7, 7)", + "pool_type": "avg" + }, + "inputs": [[225, 0, 0]] + }, + { + "op": "Flatten", + "name": "flatten0", + "inputs": [[226, 0, 0]] + }, + { + "op": "null", + "name": "pre_fc1_weight", + "attrs": {"num_hidden": "202"}, + "inputs": [] + }, + { + "op": "null", + "name": "pre_fc1_bias", + "attrs": {"num_hidden": "202"}, + "inputs": [] + }, + { + "op": "FullyConnected", + "name": "pre_fc1", + "attrs": {"num_hidden": "202"}, + "inputs": [[227, 0, 0], [228, 0, 0], [229, 0, 0]] + }, + { + "op": "null", + "name": "fc1_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "True", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "fc1_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "True", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "fc1_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "True", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": 
"null", + "name": "fc1_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "True", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "fc1", + "attrs": { + "eps": "2e-05", + "fix_gamma": "True", + "momentum": "0.9" + }, + "inputs": [[230, 0, 0], [231, 0, 0], [232, 0, 0], [233, 0, 1], [234, 0, 1]] + } + ], + "arg_nodes": [ + 0, + 3, + 5, + 6, + 7, + 8, + 11, + 13, + 14, + 15, + 16, + 19, + 21, + 22, + 23, + 24, + 27, + 29, + 30, + 31, + 32, + 35, + 37, + 38, + 39, + 40, + 43, + 45, + 46, + 47, + 48, + 51, + 53, + 54, + 55, + 56, + 59, + 61, + 62, + 63, + 64, + 67, + 69, + 70, + 71, + 72, + 75, + 77, + 78, + 79, + 80, + 83, + 85, + 86, + 87, + 88, + 91, + 93, + 94, + 95, + 96, + 99, + 101, + 102, + 103, + 104, + 107, + 109, + 110, + 111, + 112, + 115, + 117, + 118, + 119, + 120, + 123, + 125, + 126, + 127, + 128, + 131, + 133, + 134, + 135, + 136, + 139, + 141, + 142, + 143, + 144, + 147, + 149, + 150, + 151, + 152, + 155, + 157, + 158, + 159, + 160, + 163, + 165, + 166, + 167, + 168, + 171, + 173, + 174, + 175, + 176, + 179, + 181, + 182, + 183, + 184, + 187, + 189, + 190, + 191, + 192, + 195, + 197, + 198, + 199, + 200, + 203, + 205, + 206, + 207, + 208, + 211, + 213, + 214, + 215, + 216, + 219, + 220, + 221, + 222, + 224, + 228, + 229, + 231, + 232, + 233, + 234 + ], + "node_row_ptr": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 122, + 123, + 124, + 125, + 
126, + 127, + 128, + 129, + 132, + 133, + 134, + 135, + 136, + 137, + 138, + 139, + 142, + 143, + 144, + 145, + 146, + 147, + 148, + 149, + 152, + 153, + 154, + 155, + 156, + 157, + 158, + 159, + 162, + 163, + 164, + 165, + 166, + 167, + 168, + 169, + 172, + 173, + 174, + 175, + 176, + 177, + 178, + 179, + 182, + 183, + 184, + 185, + 186, + 187, + 188, + 189, + 192, + 193, + 194, + 195, + 196, + 197, + 198, + 199, + 202, + 203, + 204, + 205, + 206, + 207, + 208, + 209, + 212, + 213, + 214, + 215, + 216, + 217, + 218, + 219, + 222, + 223, + 224, + 225, + 226, + 227, + 228, + 229, + 232, + 233, + 234, + 235, + 236, + 237, + 238, + 239, + 242, + 243, + 244, + 245, + 246, + 247, + 248, + 249, + 252, + 253, + 254, + 255, + 256, + 257, + 258, + 259, + 262, + 263, + 264, + 265, + 266, + 267, + 268, + 269, + 272, + 273, + 274, + 275, + 276, + 277, + 280, + 281, + 282, + 283, + 284, + 285, + 286, + 287, + 288, + 289, + 290, + 291, + 294 + ], + "heads": [[235, 0, 0]], + "attrs": {"mxnet_version": ["int", 10300]} +} \ No newline at end of file diff --git a/insightface/models/retinaface_r50_v1/R50-0000.params b/insightface/models/retinaface_r50_v1/R50-0000.params new file mode 100644 index 0000000000000000000000000000000000000000..af0b5f4902d462345379021925ec24f8a1ac75ef --- /dev/null +++ b/insightface/models/retinaface_r50_v1/R50-0000.params @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20818d53adcefea4d3c4f31ba555910b9d052836588607af50af28cb414cb31e +size 118010124 diff --git a/insightface/models/retinaface_r50_v1/R50-symbol.json b/insightface/models/retinaface_r50_v1/R50-symbol.json new file mode 100644 index 0000000000000000000000000000000000000000..6bdbd0a87d2b3976f732f68874d1bf167f79bebc --- /dev/null +++ b/insightface/models/retinaface_r50_v1/R50-symbol.json @@ -0,0 +1,6979 @@ +{ + "nodes": [ + { + "op": "null", + "name": "data", + "inputs": [] + }, + { + "op": "null", + "name": "bn_data_gamma", + "inputs": [] + }, + { + "op": "null", + "name": 
"bn_data_beta", + "inputs": [] + }, + { + "op": "null", + "name": "bn_data_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "True", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "null", + "name": "bn_data_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "True", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "bn_data", + "attrs": { + "eps": "2e-05", + "fix_gamma": "True", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [[0, 0, 0], [1, 0, 0], [2, 0, 0], [3, 0, 0], [4, 0, 0]] + }, + { + "op": "null", + "name": "conv0_weight", + "inputs": [] + }, + { + "op": "Convolution", + "name": "conv0", + "attrs": { + "cudnn_tune": "limited_workspace", + "dilate": "(1,1)", + "kernel": "(7,7)", + "no_bias": "True", + "num_filter": "64", + "num_group": "1", + "pad": "(3,3)", + "stride": "(2,2)", + "workspace": "256" + }, + "inputs": [[5, 0, 0], [6, 0, 0]] + }, + { + "op": "null", + "name": "bn0_gamma", + "inputs": [] + }, + { + "op": "null", + "name": "bn0_beta", + "inputs": [] + }, + { + "op": "null", + "name": "bn0_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "null", + "name": "bn0_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "bn0", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [[7, 0, 0], [8, 0, 0], [9, 0, 0], [10, 0, 0], [11, 0, 0]] + }, + { + "op": "Activation", + "name": "relu0", + "attrs": {"act_type": "relu"}, + "inputs": [[12, 0, 0]] + }, + { + "op": "Pooling", + "name": "pooling0", + 
"attrs": { + "global_pool": "False", + "kernel": "(3,3)", + "pad": "(1,1)", + "pool_type": "max", + "stride": "(2,2)" + }, + "inputs": [[13, 0, 0]] + }, + { + "op": "null", + "name": "stage1_unit1_bn1_gamma", + "inputs": [] + }, + { + "op": "null", + "name": "stage1_unit1_bn1_beta", + "inputs": [] + }, + { + "op": "null", + "name": "stage1_unit1_bn1_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage1_unit1_bn1_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage1_unit1_bn1", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [[14, 0, 0], [15, 0, 0], [16, 0, 0], [17, 0, 0], [18, 0, 0]] + }, + { + "op": "Activation", + "name": "stage1_unit1_relu1", + "attrs": {"act_type": "relu"}, + "inputs": [[19, 0, 0]] + }, + { + "op": "null", + "name": "stage1_unit1_conv1_weight", + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage1_unit1_conv1", + "attrs": { + "cudnn_tune": "limited_workspace", + "dilate": "(1,1)", + "kernel": "(1,1)", + "no_bias": "True", + "num_filter": "64", + "num_group": "1", + "pad": "(0,0)", + "stride": "(1,1)", + "workspace": "256" + }, + "inputs": [[20, 0, 0], [21, 0, 0]] + }, + { + "op": "null", + "name": "stage1_unit1_bn2_gamma", + "inputs": [] + }, + { + "op": "null", + "name": "stage1_unit1_bn2_beta", + "inputs": [] + }, + { + "op": "null", + "name": "stage1_unit1_bn2_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage1_unit1_bn2_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + 
"eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage1_unit1_bn2", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [[22, 0, 0], [23, 0, 0], [24, 0, 0], [25, 0, 0], [26, 0, 0]] + }, + { + "op": "Activation", + "name": "stage1_unit1_relu2", + "attrs": {"act_type": "relu"}, + "inputs": [[27, 0, 0]] + }, + { + "op": "null", + "name": "stage1_unit1_conv2_weight", + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage1_unit1_conv2", + "attrs": { + "cudnn_tune": "limited_workspace", + "dilate": "(1,1)", + "kernel": "(3,3)", + "no_bias": "True", + "num_filter": "64", + "num_group": "1", + "pad": "(1,1)", + "stride": "(1,1)", + "workspace": "256" + }, + "inputs": [[28, 0, 0], [29, 0, 0]] + }, + { + "op": "null", + "name": "stage1_unit1_bn3_gamma", + "inputs": [] + }, + { + "op": "null", + "name": "stage1_unit1_bn3_beta", + "inputs": [] + }, + { + "op": "null", + "name": "stage1_unit1_bn3_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage1_unit1_bn3_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage1_unit1_bn3", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [[30, 0, 0], [31, 0, 0], [32, 0, 0], [33, 0, 0], [34, 0, 0]] + }, + { + "op": "Activation", + "name": "stage1_unit1_relu3", + "attrs": {"act_type": "relu"}, + "inputs": [[35, 0, 0]] + }, + { + "op": "null", + "name": "stage1_unit1_conv3_weight", + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage1_unit1_conv3", + "attrs": { + 
"cudnn_tune": "limited_workspace", + "dilate": "(1,1)", + "kernel": "(1,1)", + "no_bias": "True", + "num_filter": "256", + "num_group": "1", + "pad": "(0,0)", + "stride": "(1,1)", + "workspace": "256" + }, + "inputs": [[36, 0, 0], [37, 0, 0]] + }, + { + "op": "null", + "name": "stage1_unit1_sc_weight", + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage1_unit1_sc", + "attrs": { + "cudnn_tune": "limited_workspace", + "dilate": "(1,1)", + "kernel": "(1,1)", + "no_bias": "True", + "num_filter": "256", + "num_group": "1", + "pad": "(0,0)", + "stride": "(1,1)", + "workspace": "256" + }, + "inputs": [[20, 0, 0], [39, 0, 0]] + }, + { + "op": "elemwise_add", + "name": "_plus0", + "inputs": [[38, 0, 0], [40, 0, 0]] + }, + { + "op": "null", + "name": "stage1_unit2_bn1_gamma", + "inputs": [] + }, + { + "op": "null", + "name": "stage1_unit2_bn1_beta", + "inputs": [] + }, + { + "op": "null", + "name": "stage1_unit2_bn1_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage1_unit2_bn1_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage1_unit2_bn1", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [[41, 0, 0], [42, 0, 0], [43, 0, 0], [44, 0, 0], [45, 0, 0]] + }, + { + "op": "Activation", + "name": "stage1_unit2_relu1", + "attrs": {"act_type": "relu"}, + "inputs": [[46, 0, 0]] + }, + { + "op": "null", + "name": "stage1_unit2_conv1_weight", + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage1_unit2_conv1", + "attrs": { + "cudnn_tune": "limited_workspace", + "dilate": "(1,1)", + "kernel": "(1,1)", + "no_bias": "True", + "num_filter": "64", + "num_group": "1", + "pad": 
"(0,0)", + "stride": "(1,1)", + "workspace": "256" + }, + "inputs": [[47, 0, 0], [48, 0, 0]] + }, + { + "op": "null", + "name": "stage1_unit2_bn2_gamma", + "inputs": [] + }, + { + "op": "null", + "name": "stage1_unit2_bn2_beta", + "inputs": [] + }, + { + "op": "null", + "name": "stage1_unit2_bn2_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage1_unit2_bn2_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage1_unit2_bn2", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [[49, 0, 0], [50, 0, 0], [51, 0, 0], [52, 0, 0], [53, 0, 0]] + }, + { + "op": "Activation", + "name": "stage1_unit2_relu2", + "attrs": {"act_type": "relu"}, + "inputs": [[54, 0, 0]] + }, + { + "op": "null", + "name": "stage1_unit2_conv2_weight", + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage1_unit2_conv2", + "attrs": { + "cudnn_tune": "limited_workspace", + "dilate": "(1,1)", + "kernel": "(3,3)", + "no_bias": "True", + "num_filter": "64", + "num_group": "1", + "pad": "(1,1)", + "stride": "(1,1)", + "workspace": "256" + }, + "inputs": [[55, 0, 0], [56, 0, 0]] + }, + { + "op": "null", + "name": "stage1_unit2_bn3_gamma", + "inputs": [] + }, + { + "op": "null", + "name": "stage1_unit2_bn3_beta", + "inputs": [] + }, + { + "op": "null", + "name": "stage1_unit2_bn3_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage1_unit2_bn3_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": 
"0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage1_unit2_bn3", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [[57, 0, 0], [58, 0, 0], [59, 0, 0], [60, 0, 0], [61, 0, 0]] + }, + { + "op": "Activation", + "name": "stage1_unit2_relu3", + "attrs": {"act_type": "relu"}, + "inputs": [[62, 0, 0]] + }, + { + "op": "null", + "name": "stage1_unit2_conv3_weight", + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage1_unit2_conv3", + "attrs": { + "cudnn_tune": "limited_workspace", + "dilate": "(1,1)", + "kernel": "(1,1)", + "no_bias": "True", + "num_filter": "256", + "num_group": "1", + "pad": "(0,0)", + "stride": "(1,1)", + "workspace": "256" + }, + "inputs": [[63, 0, 0], [64, 0, 0]] + }, + { + "op": "elemwise_add", + "name": "_plus1", + "inputs": [[65, 0, 0], [41, 0, 0]] + }, + { + "op": "null", + "name": "stage1_unit3_bn1_gamma", + "inputs": [] + }, + { + "op": "null", + "name": "stage1_unit3_bn1_beta", + "inputs": [] + }, + { + "op": "null", + "name": "stage1_unit3_bn1_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage1_unit3_bn1_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage1_unit3_bn1", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [[66, 0, 0], [67, 0, 0], [68, 0, 0], [69, 0, 0], [70, 0, 0]] + }, + { + "op": "Activation", + "name": "stage1_unit3_relu1", + "attrs": {"act_type": "relu"}, + "inputs": [[71, 0, 0]] + }, + { + "op": "null", + "name": "stage1_unit3_conv1_weight", + "inputs": [] + }, + { + "op": "Convolution", + "name": 
"stage1_unit3_conv1", + "attrs": { + "cudnn_tune": "limited_workspace", + "dilate": "(1,1)", + "kernel": "(1,1)", + "no_bias": "True", + "num_filter": "64", + "num_group": "1", + "pad": "(0,0)", + "stride": "(1,1)", + "workspace": "256" + }, + "inputs": [[72, 0, 0], [73, 0, 0]] + }, + { + "op": "null", + "name": "stage1_unit3_bn2_gamma", + "inputs": [] + }, + { + "op": "null", + "name": "stage1_unit3_bn2_beta", + "inputs": [] + }, + { + "op": "null", + "name": "stage1_unit3_bn2_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage1_unit3_bn2_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage1_unit3_bn2", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [[74, 0, 0], [75, 0, 0], [76, 0, 0], [77, 0, 0], [78, 0, 0]] + }, + { + "op": "Activation", + "name": "stage1_unit3_relu2", + "attrs": {"act_type": "relu"}, + "inputs": [[79, 0, 0]] + }, + { + "op": "null", + "name": "stage1_unit3_conv2_weight", + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage1_unit3_conv2", + "attrs": { + "cudnn_tune": "limited_workspace", + "dilate": "(1,1)", + "kernel": "(3,3)", + "no_bias": "True", + "num_filter": "64", + "num_group": "1", + "pad": "(1,1)", + "stride": "(1,1)", + "workspace": "256" + }, + "inputs": [[80, 0, 0], [81, 0, 0]] + }, + { + "op": "null", + "name": "stage1_unit3_bn3_gamma", + "inputs": [] + }, + { + "op": "null", + "name": "stage1_unit3_bn3_beta", + "inputs": [] + }, + { + "op": "null", + "name": "stage1_unit3_bn3_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" 
+ }, + "inputs": [] + }, + { + "op": "null", + "name": "stage1_unit3_bn3_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage1_unit3_bn3", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [[82, 0, 0], [83, 0, 0], [84, 0, 0], [85, 0, 0], [86, 0, 0]] + }, + { + "op": "Activation", + "name": "stage1_unit3_relu3", + "attrs": {"act_type": "relu"}, + "inputs": [[87, 0, 0]] + }, + { + "op": "null", + "name": "stage1_unit3_conv3_weight", + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage1_unit3_conv3", + "attrs": { + "cudnn_tune": "limited_workspace", + "dilate": "(1,1)", + "kernel": "(1,1)", + "no_bias": "True", + "num_filter": "256", + "num_group": "1", + "pad": "(0,0)", + "stride": "(1,1)", + "workspace": "256" + }, + "inputs": [[88, 0, 0], [89, 0, 0]] + }, + { + "op": "elemwise_add", + "name": "_plus2", + "inputs": [[90, 0, 0], [66, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit1_bn1_gamma", + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit1_bn1_beta", + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit1_bn1_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit1_bn1_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage2_unit1_bn1", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [[91, 0, 0], [92, 0, 0], [93, 0, 0], [94, 0, 0], [95, 0, 0]] + }, + { + "op": "Activation", + "name": 
"stage2_unit1_relu1", + "attrs": {"act_type": "relu"}, + "inputs": [[96, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit1_conv1_weight", + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage2_unit1_conv1", + "attrs": { + "cudnn_tune": "limited_workspace", + "dilate": "(1,1)", + "kernel": "(1,1)", + "no_bias": "True", + "num_filter": "128", + "num_group": "1", + "pad": "(0,0)", + "stride": "(1,1)", + "workspace": "256" + }, + "inputs": [[97, 0, 0], [98, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit1_bn2_gamma", + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit1_bn2_beta", + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit1_bn2_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit1_bn2_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage2_unit1_bn2", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [[99, 0, 0], [100, 0, 0], [101, 0, 0], [102, 0, 0], [103, 0, 0]] + }, + { + "op": "Activation", + "name": "stage2_unit1_relu2", + "attrs": {"act_type": "relu"}, + "inputs": [[104, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit1_conv2_weight", + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage2_unit1_conv2", + "attrs": { + "cudnn_tune": "limited_workspace", + "dilate": "(1,1)", + "kernel": "(3,3)", + "no_bias": "True", + "num_filter": "128", + "num_group": "1", + "pad": "(1,1)", + "stride": "(2,2)", + "workspace": "256" + }, + "inputs": [[105, 0, 0], [106, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit1_bn3_gamma", + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit1_bn3_beta", + 
"inputs": [] + }, + { + "op": "null", + "name": "stage2_unit1_bn3_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit1_bn3_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage2_unit1_bn3", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [[107, 0, 0], [108, 0, 0], [109, 0, 0], [110, 0, 0], [111, 0, 0]] + }, + { + "op": "Activation", + "name": "stage2_unit1_relu3", + "attrs": {"act_type": "relu"}, + "inputs": [[112, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit1_conv3_weight", + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage2_unit1_conv3", + "attrs": { + "cudnn_tune": "limited_workspace", + "dilate": "(1,1)", + "kernel": "(1,1)", + "no_bias": "True", + "num_filter": "512", + "num_group": "1", + "pad": "(0,0)", + "stride": "(1,1)", + "workspace": "256" + }, + "inputs": [[113, 0, 0], [114, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit1_sc_weight", + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage2_unit1_sc", + "attrs": { + "cudnn_tune": "limited_workspace", + "dilate": "(1,1)", + "kernel": "(1,1)", + "no_bias": "True", + "num_filter": "512", + "num_group": "1", + "pad": "(0,0)", + "stride": "(2,2)", + "workspace": "256" + }, + "inputs": [[97, 0, 0], [116, 0, 0]] + }, + { + "op": "elemwise_add", + "name": "_plus3", + "inputs": [[115, 0, 0], [117, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit2_bn1_gamma", + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit2_bn1_beta", + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit2_bn1_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": 
"2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit2_bn1_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage2_unit2_bn1", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [[118, 0, 0], [119, 0, 0], [120, 0, 0], [121, 0, 0], [122, 0, 0]] + }, + { + "op": "Activation", + "name": "stage2_unit2_relu1", + "attrs": {"act_type": "relu"}, + "inputs": [[123, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit2_conv1_weight", + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage2_unit2_conv1", + "attrs": { + "cudnn_tune": "limited_workspace", + "dilate": "(1,1)", + "kernel": "(1,1)", + "no_bias": "True", + "num_filter": "128", + "num_group": "1", + "pad": "(0,0)", + "stride": "(1,1)", + "workspace": "256" + }, + "inputs": [[124, 0, 0], [125, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit2_bn2_gamma", + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit2_bn2_beta", + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit2_bn2_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit2_bn2_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage2_unit2_bn2", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [[126, 0, 0], [127, 0, 0], [128, 0, 0], [129, 0, 0], [130, 0, 0]] + }, + { + "op": "Activation", + "name": 
"stage2_unit2_relu2", + "attrs": {"act_type": "relu"}, + "inputs": [[131, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit2_conv2_weight", + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage2_unit2_conv2", + "attrs": { + "cudnn_tune": "limited_workspace", + "dilate": "(1,1)", + "kernel": "(3,3)", + "no_bias": "True", + "num_filter": "128", + "num_group": "1", + "pad": "(1,1)", + "stride": "(1,1)", + "workspace": "256" + }, + "inputs": [[132, 0, 0], [133, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit2_bn3_gamma", + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit2_bn3_beta", + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit2_bn3_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit2_bn3_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage2_unit2_bn3", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [[134, 0, 0], [135, 0, 0], [136, 0, 0], [137, 0, 0], [138, 0, 0]] + }, + { + "op": "Activation", + "name": "stage2_unit2_relu3", + "attrs": {"act_type": "relu"}, + "inputs": [[139, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit2_conv3_weight", + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage2_unit2_conv3", + "attrs": { + "cudnn_tune": "limited_workspace", + "dilate": "(1,1)", + "kernel": "(1,1)", + "no_bias": "True", + "num_filter": "512", + "num_group": "1", + "pad": "(0,0)", + "stride": "(1,1)", + "workspace": "256" + }, + "inputs": [[140, 0, 0], [141, 0, 0]] + }, + { + "op": "elemwise_add", + "name": "_plus4", + "inputs": [[142, 0, 0], [118, 0, 0]] + }, + { + "op": "null", + "name": 
"stage2_unit3_bn1_gamma", + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit3_bn1_beta", + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit3_bn1_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit3_bn1_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage2_unit3_bn1", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [[143, 0, 0], [144, 0, 0], [145, 0, 0], [146, 0, 0], [147, 0, 0]] + }, + { + "op": "Activation", + "name": "stage2_unit3_relu1", + "attrs": {"act_type": "relu"}, + "inputs": [[148, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit3_conv1_weight", + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage2_unit3_conv1", + "attrs": { + "cudnn_tune": "limited_workspace", + "dilate": "(1,1)", + "kernel": "(1,1)", + "no_bias": "True", + "num_filter": "128", + "num_group": "1", + "pad": "(0,0)", + "stride": "(1,1)", + "workspace": "256" + }, + "inputs": [[149, 0, 0], [150, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit3_bn2_gamma", + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit3_bn2_beta", + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit3_bn2_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit3_bn2_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage2_unit3_bn2", + 
"attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [[151, 0, 0], [152, 0, 0], [153, 0, 0], [154, 0, 0], [155, 0, 0]] + }, + { + "op": "Activation", + "name": "stage2_unit3_relu2", + "attrs": {"act_type": "relu"}, + "inputs": [[156, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit3_conv2_weight", + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage2_unit3_conv2", + "attrs": { + "cudnn_tune": "limited_workspace", + "dilate": "(1,1)", + "kernel": "(3,3)", + "no_bias": "True", + "num_filter": "128", + "num_group": "1", + "pad": "(1,1)", + "stride": "(1,1)", + "workspace": "256" + }, + "inputs": [[157, 0, 0], [158, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit3_bn3_gamma", + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit3_bn3_beta", + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit3_bn3_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit3_bn3_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage2_unit3_bn3", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [[159, 0, 0], [160, 0, 0], [161, 0, 0], [162, 0, 0], [163, 0, 0]] + }, + { + "op": "Activation", + "name": "stage2_unit3_relu3", + "attrs": {"act_type": "relu"}, + "inputs": [[164, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit3_conv3_weight", + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage2_unit3_conv3", + "attrs": { + "cudnn_tune": "limited_workspace", + "dilate": "(1,1)", + "kernel": "(1,1)", + "no_bias": "True", + "num_filter": "512", + "num_group": "1", + "pad": "(0,0)", 
+ "stride": "(1,1)", + "workspace": "256" + }, + "inputs": [[165, 0, 0], [166, 0, 0]] + }, + { + "op": "elemwise_add", + "name": "_plus5", + "inputs": [[167, 0, 0], [143, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit4_bn1_gamma", + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit4_bn1_beta", + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit4_bn1_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit4_bn1_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage2_unit4_bn1", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [[168, 0, 0], [169, 0, 0], [170, 0, 0], [171, 0, 0], [172, 0, 0]] + }, + { + "op": "Activation", + "name": "stage2_unit4_relu1", + "attrs": {"act_type": "relu"}, + "inputs": [[173, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit4_conv1_weight", + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage2_unit4_conv1", + "attrs": { + "cudnn_tune": "limited_workspace", + "dilate": "(1,1)", + "kernel": "(1,1)", + "no_bias": "True", + "num_filter": "128", + "num_group": "1", + "pad": "(0,0)", + "stride": "(1,1)", + "workspace": "256" + }, + "inputs": [[174, 0, 0], [175, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit4_bn2_gamma", + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit4_bn2_beta", + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit4_bn2_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit4_bn2_moving_var", + 
"attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage2_unit4_bn2", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [[176, 0, 0], [177, 0, 0], [178, 0, 0], [179, 0, 0], [180, 0, 0]] + }, + { + "op": "Activation", + "name": "stage2_unit4_relu2", + "attrs": {"act_type": "relu"}, + "inputs": [[181, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit4_conv2_weight", + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage2_unit4_conv2", + "attrs": { + "cudnn_tune": "limited_workspace", + "dilate": "(1,1)", + "kernel": "(3,3)", + "no_bias": "True", + "num_filter": "128", + "num_group": "1", + "pad": "(1,1)", + "stride": "(1,1)", + "workspace": "256" + }, + "inputs": [[182, 0, 0], [183, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit4_bn3_gamma", + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit4_bn3_beta", + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit4_bn3_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage2_unit4_bn3_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage2_unit4_bn3", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [[184, 0, 0], [185, 0, 0], [186, 0, 0], [187, 0, 0], [188, 0, 0]] + }, + { + "op": "Activation", + "name": "stage2_unit4_relu3", + "attrs": {"act_type": "relu"}, + "inputs": [[189, 0, 0]] + }, + { + "op": "null", + "name": "stage2_unit4_conv3_weight", + "inputs": [] + }, + { + "op": 
"Convolution", + "name": "stage2_unit4_conv3", + "attrs": { + "cudnn_tune": "limited_workspace", + "dilate": "(1,1)", + "kernel": "(1,1)", + "no_bias": "True", + "num_filter": "512", + "num_group": "1", + "pad": "(0,0)", + "stride": "(1,1)", + "workspace": "256" + }, + "inputs": [[190, 0, 0], [191, 0, 0]] + }, + { + "op": "elemwise_add", + "name": "_plus6", + "inputs": [[192, 0, 0], [168, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit1_bn1_gamma", + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit1_bn1_beta", + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit1_bn1_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit1_bn1_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit1_bn1", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [[193, 0, 0], [194, 0, 0], [195, 0, 0], [196, 0, 0], [197, 0, 0]] + }, + { + "op": "Activation", + "name": "stage3_unit1_relu1", + "attrs": {"act_type": "relu"}, + "inputs": [[198, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit1_conv1_weight", + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit1_conv1", + "attrs": { + "cudnn_tune": "limited_workspace", + "dilate": "(1,1)", + "kernel": "(1,1)", + "no_bias": "True", + "num_filter": "256", + "num_group": "1", + "pad": "(0,0)", + "stride": "(1,1)", + "workspace": "256" + }, + "inputs": [[199, 0, 0], [200, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit1_bn2_gamma", + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit1_bn2_beta", + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit1_bn2_moving_mean", + 
"attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit1_bn2_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit1_bn2", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [[201, 0, 0], [202, 0, 0], [203, 0, 0], [204, 0, 0], [205, 0, 0]] + }, + { + "op": "Activation", + "name": "stage3_unit1_relu2", + "attrs": {"act_type": "relu"}, + "inputs": [[206, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit1_conv2_weight", + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit1_conv2", + "attrs": { + "cudnn_tune": "limited_workspace", + "dilate": "(1,1)", + "kernel": "(3,3)", + "no_bias": "True", + "num_filter": "256", + "num_group": "1", + "pad": "(1,1)", + "stride": "(2,2)", + "workspace": "256" + }, + "inputs": [[207, 0, 0], [208, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit1_bn3_gamma", + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit1_bn3_beta", + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit1_bn3_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit1_bn3_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit1_bn3", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [[209, 0, 0], [210, 0, 0], [211, 0, 0], [212, 0, 0], [213, 0, 
0]] + }, + { + "op": "Activation", + "name": "stage3_unit1_relu3", + "attrs": {"act_type": "relu"}, + "inputs": [[214, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit1_conv3_weight", + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit1_conv3", + "attrs": { + "cudnn_tune": "limited_workspace", + "dilate": "(1,1)", + "kernel": "(1,1)", + "no_bias": "True", + "num_filter": "1024", + "num_group": "1", + "pad": "(0,0)", + "stride": "(1,1)", + "workspace": "256" + }, + "inputs": [[215, 0, 0], [216, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit1_sc_weight", + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit1_sc", + "attrs": { + "cudnn_tune": "limited_workspace", + "dilate": "(1,1)", + "kernel": "(1,1)", + "no_bias": "True", + "num_filter": "1024", + "num_group": "1", + "pad": "(0,0)", + "stride": "(2,2)", + "workspace": "256" + }, + "inputs": [[199, 0, 0], [218, 0, 0]] + }, + { + "op": "elemwise_add", + "name": "_plus7", + "inputs": [[217, 0, 0], [219, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit2_bn1_gamma", + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit2_bn1_beta", + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit2_bn1_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit2_bn1_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit2_bn1", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [[220, 0, 0], [221, 0, 0], [222, 0, 0], [223, 0, 0], [224, 0, 0]] + }, + { + "op": "Activation", + "name": "stage3_unit2_relu1", + "attrs": {"act_type": "relu"}, + "inputs": [[225, 0, 0]] + }, + { 
+ "op": "null", + "name": "stage3_unit2_conv1_weight", + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit2_conv1", + "attrs": { + "cudnn_tune": "limited_workspace", + "dilate": "(1,1)", + "kernel": "(1,1)", + "no_bias": "True", + "num_filter": "256", + "num_group": "1", + "pad": "(0,0)", + "stride": "(1,1)", + "workspace": "256" + }, + "inputs": [[226, 0, 0], [227, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit2_bn2_gamma", + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit2_bn2_beta", + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit2_bn2_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit2_bn2_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit2_bn2", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [[228, 0, 0], [229, 0, 0], [230, 0, 0], [231, 0, 0], [232, 0, 0]] + }, + { + "op": "Activation", + "name": "stage3_unit2_relu2", + "attrs": {"act_type": "relu"}, + "inputs": [[233, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit2_conv2_weight", + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit2_conv2", + "attrs": { + "cudnn_tune": "limited_workspace", + "dilate": "(1,1)", + "kernel": "(3,3)", + "no_bias": "True", + "num_filter": "256", + "num_group": "1", + "pad": "(1,1)", + "stride": "(1,1)", + "workspace": "256" + }, + "inputs": [[234, 0, 0], [235, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit2_bn3_gamma", + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit2_bn3_beta", + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit2_bn3_moving_mean", + "attrs": { + 
"__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit2_bn3_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit2_bn3", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [[236, 0, 0], [237, 0, 0], [238, 0, 0], [239, 0, 0], [240, 0, 0]] + }, + { + "op": "Activation", + "name": "stage3_unit2_relu3", + "attrs": {"act_type": "relu"}, + "inputs": [[241, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit2_conv3_weight", + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit2_conv3", + "attrs": { + "cudnn_tune": "limited_workspace", + "dilate": "(1,1)", + "kernel": "(1,1)", + "no_bias": "True", + "num_filter": "1024", + "num_group": "1", + "pad": "(0,0)", + "stride": "(1,1)", + "workspace": "256" + }, + "inputs": [[242, 0, 0], [243, 0, 0]] + }, + { + "op": "elemwise_add", + "name": "_plus8", + "inputs": [[244, 0, 0], [220, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit3_bn1_gamma", + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit3_bn1_beta", + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit3_bn1_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit3_bn1_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit3_bn1", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + 
}, + "inputs": [[245, 0, 0], [246, 0, 0], [247, 0, 0], [248, 0, 0], [249, 0, 0]] + }, + { + "op": "Activation", + "name": "stage3_unit3_relu1", + "attrs": {"act_type": "relu"}, + "inputs": [[250, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit3_conv1_weight", + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit3_conv1", + "attrs": { + "cudnn_tune": "limited_workspace", + "dilate": "(1,1)", + "kernel": "(1,1)", + "no_bias": "True", + "num_filter": "256", + "num_group": "1", + "pad": "(0,0)", + "stride": "(1,1)", + "workspace": "256" + }, + "inputs": [[251, 0, 0], [252, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit3_bn2_gamma", + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit3_bn2_beta", + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit3_bn2_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit3_bn2_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit3_bn2", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [[253, 0, 0], [254, 0, 0], [255, 0, 0], [256, 0, 0], [257, 0, 0]] + }, + { + "op": "Activation", + "name": "stage3_unit3_relu2", + "attrs": {"act_type": "relu"}, + "inputs": [[258, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit3_conv2_weight", + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit3_conv2", + "attrs": { + "cudnn_tune": "limited_workspace", + "dilate": "(1,1)", + "kernel": "(3,3)", + "no_bias": "True", + "num_filter": "256", + "num_group": "1", + "pad": "(1,1)", + "stride": "(1,1)", + "workspace": "256" + }, + "inputs": [[259, 0, 0], [260, 0, 0]] + }, + { + "op": 
"null", + "name": "stage3_unit3_bn3_gamma", + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit3_bn3_beta", + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit3_bn3_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit3_bn3_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit3_bn3", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [[261, 0, 0], [262, 0, 0], [263, 0, 0], [264, 0, 0], [265, 0, 0]] + }, + { + "op": "Activation", + "name": "stage3_unit3_relu3", + "attrs": {"act_type": "relu"}, + "inputs": [[266, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit3_conv3_weight", + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit3_conv3", + "attrs": { + "cudnn_tune": "limited_workspace", + "dilate": "(1,1)", + "kernel": "(1,1)", + "no_bias": "True", + "num_filter": "1024", + "num_group": "1", + "pad": "(0,0)", + "stride": "(1,1)", + "workspace": "256" + }, + "inputs": [[267, 0, 0], [268, 0, 0]] + }, + { + "op": "elemwise_add", + "name": "_plus9", + "inputs": [[269, 0, 0], [245, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit4_bn1_gamma", + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit4_bn1_beta", + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit4_bn1_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit4_bn1_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + 
"use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit4_bn1", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [[270, 0, 0], [271, 0, 0], [272, 0, 0], [273, 0, 0], [274, 0, 0]] + }, + { + "op": "Activation", + "name": "stage3_unit4_relu1", + "attrs": {"act_type": "relu"}, + "inputs": [[275, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit4_conv1_weight", + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit4_conv1", + "attrs": { + "cudnn_tune": "limited_workspace", + "dilate": "(1,1)", + "kernel": "(1,1)", + "no_bias": "True", + "num_filter": "256", + "num_group": "1", + "pad": "(0,0)", + "stride": "(1,1)", + "workspace": "256" + }, + "inputs": [[276, 0, 0], [277, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit4_bn2_gamma", + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit4_bn2_beta", + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit4_bn2_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit4_bn2_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit4_bn2", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [[278, 0, 0], [279, 0, 0], [280, 0, 0], [281, 0, 0], [282, 0, 0]] + }, + { + "op": "Activation", + "name": "stage3_unit4_relu2", + "attrs": {"act_type": "relu"}, + "inputs": [[283, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit4_conv2_weight", + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit4_conv2", + "attrs": { + "cudnn_tune": "limited_workspace", + "dilate": 
"(1,1)", + "kernel": "(3,3)", + "no_bias": "True", + "num_filter": "256", + "num_group": "1", + "pad": "(1,1)", + "stride": "(1,1)", + "workspace": "256" + }, + "inputs": [[284, 0, 0], [285, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit4_bn3_gamma", + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit4_bn3_beta", + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit4_bn3_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit4_bn3_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit4_bn3", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [[286, 0, 0], [287, 0, 0], [288, 0, 0], [289, 0, 0], [290, 0, 0]] + }, + { + "op": "Activation", + "name": "stage3_unit4_relu3", + "attrs": {"act_type": "relu"}, + "inputs": [[291, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit4_conv3_weight", + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit4_conv3", + "attrs": { + "cudnn_tune": "limited_workspace", + "dilate": "(1,1)", + "kernel": "(1,1)", + "no_bias": "True", + "num_filter": "1024", + "num_group": "1", + "pad": "(0,0)", + "stride": "(1,1)", + "workspace": "256" + }, + "inputs": [[292, 0, 0], [293, 0, 0]] + }, + { + "op": "elemwise_add", + "name": "_plus10", + "inputs": [[294, 0, 0], [270, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit5_bn1_gamma", + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit5_bn1_beta", + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit5_bn1_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + 
"use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit5_bn1_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit5_bn1", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [[295, 0, 0], [296, 0, 0], [297, 0, 0], [298, 0, 0], [299, 0, 0]] + }, + { + "op": "Activation", + "name": "stage3_unit5_relu1", + "attrs": {"act_type": "relu"}, + "inputs": [[300, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit5_conv1_weight", + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit5_conv1", + "attrs": { + "cudnn_tune": "limited_workspace", + "dilate": "(1,1)", + "kernel": "(1,1)", + "no_bias": "True", + "num_filter": "256", + "num_group": "1", + "pad": "(0,0)", + "stride": "(1,1)", + "workspace": "256" + }, + "inputs": [[301, 0, 0], [302, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit5_bn2_gamma", + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit5_bn2_beta", + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit5_bn2_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit5_bn2_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit5_bn2", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [[303, 0, 0], [304, 0, 0], [305, 0, 0], [306, 0, 0], [307, 0, 0]] + }, + { + "op": "Activation", + "name": "stage3_unit5_relu2", + "attrs": {"act_type": "relu"}, + 
"inputs": [[308, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit5_conv2_weight", + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit5_conv2", + "attrs": { + "cudnn_tune": "limited_workspace", + "dilate": "(1,1)", + "kernel": "(3,3)", + "no_bias": "True", + "num_filter": "256", + "num_group": "1", + "pad": "(1,1)", + "stride": "(1,1)", + "workspace": "256" + }, + "inputs": [[309, 0, 0], [310, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit5_bn3_gamma", + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit5_bn3_beta", + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit5_bn3_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit5_bn3_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit5_bn3", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [[311, 0, 0], [312, 0, 0], [313, 0, 0], [314, 0, 0], [315, 0, 0]] + }, + { + "op": "Activation", + "name": "stage3_unit5_relu3", + "attrs": {"act_type": "relu"}, + "inputs": [[316, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit5_conv3_weight", + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit5_conv3", + "attrs": { + "cudnn_tune": "limited_workspace", + "dilate": "(1,1)", + "kernel": "(1,1)", + "no_bias": "True", + "num_filter": "1024", + "num_group": "1", + "pad": "(0,0)", + "stride": "(1,1)", + "workspace": "256" + }, + "inputs": [[317, 0, 0], [318, 0, 0]] + }, + { + "op": "elemwise_add", + "name": "_plus11", + "inputs": [[319, 0, 0], [295, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit6_bn1_gamma", + "inputs": [] + }, + { + "op": "null", + 
"name": "stage3_unit6_bn1_beta", + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit6_bn1_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit6_bn1_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit6_bn1", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [[320, 0, 0], [321, 0, 0], [322, 0, 0], [323, 0, 0], [324, 0, 0]] + }, + { + "op": "Activation", + "name": "stage3_unit6_relu1", + "attrs": {"act_type": "relu"}, + "inputs": [[325, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit6_conv1_weight", + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit6_conv1", + "attrs": { + "cudnn_tune": "limited_workspace", + "dilate": "(1,1)", + "kernel": "(1,1)", + "no_bias": "True", + "num_filter": "256", + "num_group": "1", + "pad": "(0,0)", + "stride": "(1,1)", + "workspace": "256" + }, + "inputs": [[326, 0, 0], [327, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit6_bn2_gamma", + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit6_bn2_beta", + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit6_bn2_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit6_bn2_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit6_bn2", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": 
"0.9", + "use_global_stats": "False" + }, + "inputs": [[328, 0, 0], [329, 0, 0], [330, 0, 0], [331, 0, 0], [332, 0, 0]] + }, + { + "op": "Activation", + "name": "stage3_unit6_relu2", + "attrs": {"act_type": "relu"}, + "inputs": [[333, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit6_conv2_weight", + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit6_conv2", + "attrs": { + "cudnn_tune": "limited_workspace", + "dilate": "(1,1)", + "kernel": "(3,3)", + "no_bias": "True", + "num_filter": "256", + "num_group": "1", + "pad": "(1,1)", + "stride": "(1,1)", + "workspace": "256" + }, + "inputs": [[334, 0, 0], [335, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit6_bn3_gamma", + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit6_bn3_beta", + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit6_bn3_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage3_unit6_bn3_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage3_unit6_bn3", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [[336, 0, 0], [337, 0, 0], [338, 0, 0], [339, 0, 0], [340, 0, 0]] + }, + { + "op": "Activation", + "name": "stage3_unit6_relu3", + "attrs": {"act_type": "relu"}, + "inputs": [[341, 0, 0]] + }, + { + "op": "null", + "name": "stage3_unit6_conv3_weight", + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage3_unit6_conv3", + "attrs": { + "cudnn_tune": "limited_workspace", + "dilate": "(1,1)", + "kernel": "(1,1)", + "no_bias": "True", + "num_filter": "1024", + "num_group": "1", + "pad": "(0,0)", + "stride": "(1,1)", + "workspace": "256" + }, + "inputs": [[342, 
0, 0], [343, 0, 0]] + }, + { + "op": "elemwise_add", + "name": "_plus12", + "inputs": [[344, 0, 0], [320, 0, 0]] + }, + { + "op": "null", + "name": "stage4_unit1_bn1_gamma", + "inputs": [] + }, + { + "op": "null", + "name": "stage4_unit1_bn1_beta", + "inputs": [] + }, + { + "op": "null", + "name": "stage4_unit1_bn1_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage4_unit1_bn1_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage4_unit1_bn1", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [[345, 0, 0], [346, 0, 0], [347, 0, 0], [348, 0, 0], [349, 0, 0]] + }, + { + "op": "Activation", + "name": "stage4_unit1_relu1", + "attrs": {"act_type": "relu"}, + "inputs": [[350, 0, 0]] + }, + { + "op": "null", + "name": "stage4_unit1_conv1_weight", + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage4_unit1_conv1", + "attrs": { + "cudnn_tune": "limited_workspace", + "dilate": "(1,1)", + "kernel": "(1,1)", + "no_bias": "True", + "num_filter": "512", + "num_group": "1", + "pad": "(0,0)", + "stride": "(1,1)", + "workspace": "256" + }, + "inputs": [[351, 0, 0], [352, 0, 0]] + }, + { + "op": "null", + "name": "stage4_unit1_bn2_gamma", + "inputs": [] + }, + { + "op": "null", + "name": "stage4_unit1_bn2_beta", + "inputs": [] + }, + { + "op": "null", + "name": "stage4_unit1_bn2_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage4_unit1_bn2_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + 
"fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage4_unit1_bn2", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [[353, 0, 0], [354, 0, 0], [355, 0, 0], [356, 0, 0], [357, 0, 0]] + }, + { + "op": "Activation", + "name": "stage4_unit1_relu2", + "attrs": {"act_type": "relu"}, + "inputs": [[358, 0, 0]] + }, + { + "op": "null", + "name": "stage4_unit1_conv2_weight", + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage4_unit1_conv2", + "attrs": { + "cudnn_tune": "limited_workspace", + "dilate": "(1,1)", + "kernel": "(3,3)", + "no_bias": "True", + "num_filter": "512", + "num_group": "1", + "pad": "(1,1)", + "stride": "(2,2)", + "workspace": "256" + }, + "inputs": [[359, 0, 0], [360, 0, 0]] + }, + { + "op": "null", + "name": "stage4_unit1_bn3_gamma", + "inputs": [] + }, + { + "op": "null", + "name": "stage4_unit1_bn3_beta", + "inputs": [] + }, + { + "op": "null", + "name": "stage4_unit1_bn3_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage4_unit1_bn3_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage4_unit1_bn3", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [[361, 0, 0], [362, 0, 0], [363, 0, 0], [364, 0, 0], [365, 0, 0]] + }, + { + "op": "Activation", + "name": "stage4_unit1_relu3", + "attrs": {"act_type": "relu"}, + "inputs": [[366, 0, 0]] + }, + { + "op": "null", + "name": "stage4_unit1_conv3_weight", + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage4_unit1_conv3", + "attrs": { + 
"cudnn_tune": "limited_workspace", + "dilate": "(1,1)", + "kernel": "(1,1)", + "no_bias": "True", + "num_filter": "2048", + "num_group": "1", + "pad": "(0,0)", + "stride": "(1,1)", + "workspace": "256" + }, + "inputs": [[367, 0, 0], [368, 0, 0]] + }, + { + "op": "null", + "name": "stage4_unit1_sc_weight", + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage4_unit1_sc", + "attrs": { + "cudnn_tune": "limited_workspace", + "dilate": "(1,1)", + "kernel": "(1,1)", + "no_bias": "True", + "num_filter": "2048", + "num_group": "1", + "pad": "(0,0)", + "stride": "(2,2)", + "workspace": "256" + }, + "inputs": [[351, 0, 0], [370, 0, 0]] + }, + { + "op": "elemwise_add", + "name": "_plus13", + "inputs": [[369, 0, 0], [371, 0, 0]] + }, + { + "op": "null", + "name": "stage4_unit2_bn1_gamma", + "inputs": [] + }, + { + "op": "null", + "name": "stage4_unit2_bn1_beta", + "inputs": [] + }, + { + "op": "null", + "name": "stage4_unit2_bn1_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage4_unit2_bn1_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage4_unit2_bn1", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [[372, 0, 0], [373, 0, 0], [374, 0, 0], [375, 0, 0], [376, 0, 0]] + }, + { + "op": "Activation", + "name": "stage4_unit2_relu1", + "attrs": {"act_type": "relu"}, + "inputs": [[377, 0, 0]] + }, + { + "op": "null", + "name": "stage4_unit2_conv1_weight", + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage4_unit2_conv1", + "attrs": { + "cudnn_tune": "limited_workspace", + "dilate": "(1,1)", + "kernel": "(1,1)", + "no_bias": "True", + "num_filter": "512", + 
"num_group": "1", + "pad": "(0,0)", + "stride": "(1,1)", + "workspace": "256" + }, + "inputs": [[378, 0, 0], [379, 0, 0]] + }, + { + "op": "null", + "name": "stage4_unit2_bn2_gamma", + "inputs": [] + }, + { + "op": "null", + "name": "stage4_unit2_bn2_beta", + "inputs": [] + }, + { + "op": "null", + "name": "stage4_unit2_bn2_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage4_unit2_bn2_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage4_unit2_bn2", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [[380, 0, 0], [381, 0, 0], [382, 0, 0], [383, 0, 0], [384, 0, 0]] + }, + { + "op": "Activation", + "name": "stage4_unit2_relu2", + "attrs": {"act_type": "relu"}, + "inputs": [[385, 0, 0]] + }, + { + "op": "null", + "name": "stage4_unit2_conv2_weight", + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage4_unit2_conv2", + "attrs": { + "cudnn_tune": "limited_workspace", + "dilate": "(1,1)", + "kernel": "(3,3)", + "no_bias": "True", + "num_filter": "512", + "num_group": "1", + "pad": "(1,1)", + "stride": "(1,1)", + "workspace": "256" + }, + "inputs": [[386, 0, 0], [387, 0, 0]] + }, + { + "op": "null", + "name": "stage4_unit2_bn3_gamma", + "inputs": [] + }, + { + "op": "null", + "name": "stage4_unit2_bn3_beta", + "inputs": [] + }, + { + "op": "null", + "name": "stage4_unit2_bn3_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage4_unit2_bn3_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", 
+ "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage4_unit2_bn3", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [[388, 0, 0], [389, 0, 0], [390, 0, 0], [391, 0, 0], [392, 0, 0]] + }, + { + "op": "Activation", + "name": "stage4_unit2_relu3", + "attrs": {"act_type": "relu"}, + "inputs": [[393, 0, 0]] + }, + { + "op": "null", + "name": "stage4_unit2_conv3_weight", + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage4_unit2_conv3", + "attrs": { + "cudnn_tune": "limited_workspace", + "dilate": "(1,1)", + "kernel": "(1,1)", + "no_bias": "True", + "num_filter": "2048", + "num_group": "1", + "pad": "(0,0)", + "stride": "(1,1)", + "workspace": "256" + }, + "inputs": [[394, 0, 0], [395, 0, 0]] + }, + { + "op": "elemwise_add", + "name": "_plus14", + "inputs": [[396, 0, 0], [372, 0, 0]] + }, + { + "op": "null", + "name": "stage4_unit3_bn1_gamma", + "inputs": [] + }, + { + "op": "null", + "name": "stage4_unit3_bn1_beta", + "inputs": [] + }, + { + "op": "null", + "name": "stage4_unit3_bn1_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage4_unit3_bn1_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage4_unit3_bn1", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [[397, 0, 0], [398, 0, 0], [399, 0, 0], [400, 0, 0], [401, 0, 0]] + }, + { + "op": "Activation", + "name": "stage4_unit3_relu1", + "attrs": {"act_type": "relu"}, + "inputs": [[402, 0, 0]] + }, + { + "op": "null", + "name": "stage4_unit3_conv1_weight", + 
"inputs": [] + }, + { + "op": "Convolution", + "name": "stage4_unit3_conv1", + "attrs": { + "cudnn_tune": "limited_workspace", + "dilate": "(1,1)", + "kernel": "(1,1)", + "no_bias": "True", + "num_filter": "512", + "num_group": "1", + "pad": "(0,0)", + "stride": "(1,1)", + "workspace": "256" + }, + "inputs": [[403, 0, 0], [404, 0, 0]] + }, + { + "op": "null", + "name": "stage4_unit3_bn2_gamma", + "inputs": [] + }, + { + "op": "null", + "name": "stage4_unit3_bn2_beta", + "inputs": [] + }, + { + "op": "null", + "name": "stage4_unit3_bn2_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage4_unit3_bn2_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage4_unit3_bn2", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [[405, 0, 0], [406, 0, 0], [407, 0, 0], [408, 0, 0], [409, 0, 0]] + }, + { + "op": "Activation", + "name": "stage4_unit3_relu2", + "attrs": {"act_type": "relu"}, + "inputs": [[410, 0, 0]] + }, + { + "op": "null", + "name": "stage4_unit3_conv2_weight", + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage4_unit3_conv2", + "attrs": { + "cudnn_tune": "limited_workspace", + "dilate": "(1,1)", + "kernel": "(3,3)", + "no_bias": "True", + "num_filter": "512", + "num_group": "1", + "pad": "(1,1)", + "stride": "(1,1)", + "workspace": "256" + }, + "inputs": [[411, 0, 0], [412, 0, 0]] + }, + { + "op": "null", + "name": "stage4_unit3_bn3_gamma", + "inputs": [] + }, + { + "op": "null", + "name": "stage4_unit3_bn3_beta", + "inputs": [] + }, + { + "op": "null", + "name": "stage4_unit3_bn3_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + 
"fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "null", + "name": "stage4_unit3_bn3_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "stage4_unit3_bn3", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [[413, 0, 0], [414, 0, 0], [415, 0, 0], [416, 0, 0], [417, 0, 0]] + }, + { + "op": "Activation", + "name": "stage4_unit3_relu3", + "attrs": {"act_type": "relu"}, + "inputs": [[418, 0, 0]] + }, + { + "op": "null", + "name": "stage4_unit3_conv3_weight", + "inputs": [] + }, + { + "op": "Convolution", + "name": "stage4_unit3_conv3", + "attrs": { + "cudnn_tune": "limited_workspace", + "dilate": "(1,1)", + "kernel": "(1,1)", + "no_bias": "True", + "num_filter": "2048", + "num_group": "1", + "pad": "(0,0)", + "stride": "(1,1)", + "workspace": "256" + }, + "inputs": [[419, 0, 0], [420, 0, 0]] + }, + { + "op": "elemwise_add", + "name": "_plus15", + "inputs": [[421, 0, 0], [397, 0, 0]] + }, + { + "op": "null", + "name": "bn1_gamma", + "inputs": [] + }, + { + "op": "null", + "name": "bn1_beta", + "inputs": [] + }, + { + "op": "null", + "name": "bn1_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "null", + "name": "bn1_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "bn1", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9", + "use_global_stats": "False" + }, + "inputs": [[422, 0, 0], [423, 0, 0], [424, 0, 0], [425, 0, 0], [426, 0, 0]] + }, + { + "op": "Activation", + 
"name": "relu1", + "attrs": {"act_type": "relu"}, + "inputs": [[427, 0, 0]] + }, + { + "op": "null", + "name": "ssh_c3_lateral_weight", + "attrs": { + "__init__": "[\"normal\", {\"sigma\": 0.01}]", + "__lr_mult__": "1.0" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_c3_lateral_bias", + "attrs": { + "__init__": "[\"constant\", {\"value\": 0.0}]", + "__lr_mult__": "2.0", + "__wd_mult__": "1.0" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "ssh_c3_lateral", + "attrs": { + "kernel": "(1, 1)", + "num_filter": "256", + "pad": "(0, 0)", + "stride": "(1, 1)" + }, + "inputs": [[428, 0, 0], [429, 0, 0], [430, 0, 0]] + }, + { + "op": "null", + "name": "ssh_c3_lateral_bn_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_c3_lateral_bn_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_c3_lateral_bn_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_c3_lateral_bn_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "ssh_c3_lateral_bn", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[431, 0, 0], [432, 0, 0], [433, 0, 0], [434, 0, 1], [435, 0, 1]] + }, + { + "op": "Activation", + "name": "ssh_c3_lateral_relu", + "attrs": {"act_type": "relu"}, + "inputs": [[436, 0, 0]] + }, + { + "op": "null", + "name": "ssh_m3_det_conv1_weight", + "attrs": { + "__init__": "[\"normal\", {\"sigma\": 0.01}]", + "__lr_mult__": "1.0" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_m3_det_conv1_bias", + "attrs": { + "__init__": "[\"constant\", {\"value\": 0.0}]", + "__lr_mult__": 
"2.0", + "__wd_mult__": "0.0" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "ssh_m3_det_conv1", + "attrs": { + "kernel": "(3, 3)", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)" + }, + "inputs": [[437, 0, 0], [438, 0, 0], [439, 0, 0]] + }, + { + "op": "null", + "name": "ssh_m3_det_conv1_bn_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_m3_det_conv1_bn_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_m3_det_conv1_bn_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_m3_det_conv1_bn_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "ssh_m3_det_conv1_bn", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[440, 0, 0], [441, 0, 0], [442, 0, 0], [443, 0, 1], [444, 0, 1]] + }, + { + "op": "null", + "name": "ssh_m3_det_context_conv1_weight", + "attrs": { + "__init__": "[\"normal\", {\"sigma\": 0.01}]", + "__lr_mult__": "1.0" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_m3_det_context_conv1_bias", + "attrs": { + "__init__": "[\"constant\", {\"value\": 0.0}]", + "__lr_mult__": "2.0", + "__wd_mult__": "0.0" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "ssh_m3_det_context_conv1", + "attrs": { + "kernel": "(3, 3)", + "num_filter": "128", + "pad": "(1, 1)", + "stride": "(1, 1)" + }, + "inputs": [[437, 0, 0], [446, 0, 0], [447, 0, 0]] + }, + { + "op": "null", + "name": "ssh_m3_det_context_conv1_bn_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": 
"ssh_m3_det_context_conv1_bn_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_m3_det_context_conv1_bn_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_m3_det_context_conv1_bn_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "ssh_m3_det_context_conv1_bn", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[448, 0, 0], [449, 0, 0], [450, 0, 0], [451, 0, 1], [452, 0, 1]] + }, + { + "op": "Activation", + "name": "ssh_m3_det_context_conv1_relu", + "attrs": {"act_type": "relu"}, + "inputs": [[453, 0, 0]] + }, + { + "op": "null", + "name": "ssh_m3_det_context_conv2_weight", + "attrs": { + "__init__": "[\"normal\", {\"sigma\": 0.01}]", + "__lr_mult__": "1.0" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_m3_det_context_conv2_bias", + "attrs": { + "__init__": "[\"constant\", {\"value\": 0.0}]", + "__lr_mult__": "2.0", + "__wd_mult__": "0.0" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "ssh_m3_det_context_conv2", + "attrs": { + "kernel": "(3, 3)", + "num_filter": "128", + "pad": "(1, 1)", + "stride": "(1, 1)" + }, + "inputs": [[454, 0, 0], [455, 0, 0], [456, 0, 0]] + }, + { + "op": "null", + "name": "ssh_m3_det_context_conv2_bn_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_m3_det_context_conv2_bn_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_m3_det_context_conv2_bn_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + 
"momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_m3_det_context_conv2_bn_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "ssh_m3_det_context_conv2_bn", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[457, 0, 0], [458, 0, 0], [459, 0, 0], [460, 0, 1], [461, 0, 1]] + }, + { + "op": "null", + "name": "ssh_m3_det_context_conv3_1_weight", + "attrs": { + "__init__": "[\"normal\", {\"sigma\": 0.01}]", + "__lr_mult__": "1.0" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_m3_det_context_conv3_1_bias", + "attrs": { + "__init__": "[\"constant\", {\"value\": 0.0}]", + "__lr_mult__": "2.0", + "__wd_mult__": "0.0" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "ssh_m3_det_context_conv3_1", + "attrs": { + "kernel": "(3, 3)", + "num_filter": "128", + "pad": "(1, 1)", + "stride": "(1, 1)" + }, + "inputs": [[454, 0, 0], [463, 0, 0], [464, 0, 0]] + }, + { + "op": "null", + "name": "ssh_m3_det_context_conv3_1_bn_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_m3_det_context_conv3_1_bn_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_m3_det_context_conv3_1_bn_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_m3_det_context_conv3_1_bn_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "ssh_m3_det_context_conv3_1_bn", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[465, 0, 
0], [466, 0, 0], [467, 0, 0], [468, 0, 1], [469, 0, 1]] + }, + { + "op": "Activation", + "name": "ssh_m3_det_context_conv3_1_relu", + "attrs": {"act_type": "relu"}, + "inputs": [[470, 0, 0]] + }, + { + "op": "null", + "name": "ssh_m3_det_context_conv3_2_weight", + "attrs": { + "__init__": "[\"normal\", {\"sigma\": 0.01}]", + "__lr_mult__": "1.0" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_m3_det_context_conv3_2_bias", + "attrs": { + "__init__": "[\"constant\", {\"value\": 0.0}]", + "__lr_mult__": "2.0", + "__wd_mult__": "0.0" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "ssh_m3_det_context_conv3_2", + "attrs": { + "kernel": "(3, 3)", + "num_filter": "128", + "pad": "(1, 1)", + "stride": "(1, 1)" + }, + "inputs": [[471, 0, 0], [472, 0, 0], [473, 0, 0]] + }, + { + "op": "null", + "name": "ssh_m3_det_context_conv3_2_bn_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_m3_det_context_conv3_2_bn_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_m3_det_context_conv3_2_bn_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_m3_det_context_conv3_2_bn_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "ssh_m3_det_context_conv3_2_bn", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[474, 0, 0], [475, 0, 0], [476, 0, 0], [477, 0, 1], [478, 0, 1]] + }, + { + "op": "Concat", + "name": "ssh_m3_det_concat", + "attrs": { + "dim": "1", + "num_args": "3" + }, + "inputs": [[445, 0, 0], [462, 0, 0], [479, 0, 0]] + }, + { + "op": "Activation", + "name": 
"ssh_m3_det_concat_relu", + "attrs": {"act_type": "relu"}, + "inputs": [[480, 0, 0]] + }, + { + "op": "null", + "name": "face_rpn_cls_score_stride32_weight", + "attrs": { + "__init__": "[\"normal\", {\"sigma\": 0.01}]", + "__lr_mult__": "1.0" + }, + "inputs": [] + }, + { + "op": "null", + "name": "face_rpn_cls_score_stride32_bias", + "attrs": { + "__init__": "[\"constant\", {\"value\": 0.0}]", + "__lr_mult__": "2.0", + "__wd_mult__": "0.0" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "face_rpn_cls_score_stride32", + "attrs": { + "kernel": "(1, 1)", + "num_filter": "4", + "pad": "(0, 0)", + "stride": "(1, 1)" + }, + "inputs": [[481, 0, 0], [482, 0, 0], [483, 0, 0]] + }, + { + "op": "Reshape", + "name": "face_rpn_cls_score_reshape_stride32", + "attrs": {"shape": "(0, 2, -1, 0)"}, + "inputs": [[484, 0, 0]] + }, + { + "op": "SoftmaxActivation", + "name": "face_rpn_cls_prob_stride32", + "attrs": {"mode": "channel"}, + "inputs": [[485, 0, 0]] + }, + { + "op": "Reshape", + "name": "face_rpn_cls_prob_reshape_stride32", + "attrs": {"shape": "(0, 4, -1, 0)"}, + "inputs": [[486, 0, 0]] + }, + { + "op": "null", + "name": "face_rpn_bbox_pred_stride32_weight", + "attrs": { + "__init__": "[\"normal\", {\"sigma\": 0.01}]", + "__lr_mult__": "1.0" + }, + "inputs": [] + }, + { + "op": "null", + "name": "face_rpn_bbox_pred_stride32_bias", + "attrs": { + "__init__": "[\"constant\", {\"value\": 0.0}]", + "__lr_mult__": "2.0", + "__wd_mult__": "0.0" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "face_rpn_bbox_pred_stride32", + "attrs": { + "kernel": "(1, 1)", + "num_filter": "8", + "pad": "(0, 0)", + "stride": "(1, 1)" + }, + "inputs": [[481, 0, 0], [488, 0, 0], [489, 0, 0]] + }, + { + "op": "null", + "name": "face_rpn_landmark_pred_stride32_weight", + "attrs": { + "__init__": "[\"normal\", {\"sigma\": 0.01}]", + "__lr_mult__": "1.0" + }, + "inputs": [] + }, + { + "op": "null", + "name": "face_rpn_landmark_pred_stride32_bias", + "attrs": { + "__init__": 
"[\"constant\", {\"value\": 0.0}]", + "__lr_mult__": "2.0", + "__wd_mult__": "0.0" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "face_rpn_landmark_pred_stride32", + "attrs": { + "kernel": "(1, 1)", + "num_filter": "20", + "pad": "(0, 0)", + "stride": "(1, 1)" + }, + "inputs": [[481, 0, 0], [491, 0, 0], [492, 0, 0]] + }, + { + "op": "null", + "name": "ssh_c2_lateral_weight", + "attrs": { + "__init__": "[\"normal\", {\"sigma\": 0.01}]", + "__lr_mult__": "1.0" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_c2_lateral_bias", + "attrs": { + "__init__": "[\"constant\", {\"value\": 0.0}]", + "__lr_mult__": "2.0", + "__wd_mult__": "1.0" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "ssh_c2_lateral", + "attrs": { + "kernel": "(1, 1)", + "num_filter": "256", + "pad": "(0, 0)", + "stride": "(1, 1)" + }, + "inputs": [[359, 0, 0], [494, 0, 0], [495, 0, 0]] + }, + { + "op": "null", + "name": "ssh_c2_lateral_bn_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_c2_lateral_bn_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_c2_lateral_bn_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_c2_lateral_bn_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "ssh_c2_lateral_bn", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[496, 0, 0], [497, 0, 0], [498, 0, 0], [499, 0, 1], [500, 0, 1]] + }, + { + "op": "Activation", + "name": "ssh_c2_lateral_relu", + "attrs": {"act_type": "relu"}, + "inputs": [[501, 0, 0]] + }, + { + "op": "UpSampling", + "name": "ssh_c3_up", 
+ "attrs": { + "num_args": "1", + "sample_type": "nearest", + "scale": "2", + "workspace": "512" + }, + "inputs": [[437, 0, 0]] + }, + { + "op": "Crop", + "name": "crop0", + "attrs": {"num_args": "2"}, + "inputs": [[503, 0, 0], [502, 0, 0]] + }, + { + "op": "elemwise_add", + "name": "_plus0", + "inputs": [[502, 0, 0], [504, 0, 0]] + }, + { + "op": "null", + "name": "ssh_c2_aggr_weight", + "attrs": { + "__init__": "[\"normal\", {\"sigma\": 0.01}]", + "__lr_mult__": "1.0" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_c2_aggr_bias", + "attrs": { + "__init__": "[\"constant\", {\"value\": 0.0}]", + "__lr_mult__": "2.0", + "__wd_mult__": "1.0" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "ssh_c2_aggr", + "attrs": { + "kernel": "(3, 3)", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)" + }, + "inputs": [[505, 0, 0], [506, 0, 0], [507, 0, 0]] + }, + { + "op": "null", + "name": "ssh_c2_aggr_bn_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_c2_aggr_bn_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_c2_aggr_bn_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_c2_aggr_bn_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "ssh_c2_aggr_bn", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[508, 0, 0], [509, 0, 0], [510, 0, 0], [511, 0, 1], [512, 0, 1]] + }, + { + "op": "Activation", + "name": "ssh_c2_aggr_relu", + "attrs": {"act_type": "relu"}, + "inputs": [[513, 0, 0]] + }, + { + "op": "null", + "name": "ssh_m2_det_conv1_weight", + "attrs": { + 
"__init__": "[\"normal\", {\"sigma\": 0.01}]", + "__lr_mult__": "1.0" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_m2_det_conv1_bias", + "attrs": { + "__init__": "[\"constant\", {\"value\": 0.0}]", + "__lr_mult__": "2.0", + "__wd_mult__": "0.0" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "ssh_m2_det_conv1", + "attrs": { + "kernel": "(3, 3)", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)" + }, + "inputs": [[514, 0, 0], [515, 0, 0], [516, 0, 0]] + }, + { + "op": "null", + "name": "ssh_m2_det_conv1_bn_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_m2_det_conv1_bn_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_m2_det_conv1_bn_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_m2_det_conv1_bn_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "ssh_m2_det_conv1_bn", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[517, 0, 0], [518, 0, 0], [519, 0, 0], [520, 0, 1], [521, 0, 1]] + }, + { + "op": "null", + "name": "ssh_m2_det_context_conv1_weight", + "attrs": { + "__init__": "[\"normal\", {\"sigma\": 0.01}]", + "__lr_mult__": "1.0" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_m2_det_context_conv1_bias", + "attrs": { + "__init__": "[\"constant\", {\"value\": 0.0}]", + "__lr_mult__": "2.0", + "__wd_mult__": "0.0" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "ssh_m2_det_context_conv1", + "attrs": { + "kernel": "(3, 3)", + "num_filter": "128", + "pad": "(1, 1)", + "stride": "(1, 1)" + }, + "inputs": [[514, 0, 0], 
[523, 0, 0], [524, 0, 0]] + }, + { + "op": "null", + "name": "ssh_m2_det_context_conv1_bn_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_m2_det_context_conv1_bn_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_m2_det_context_conv1_bn_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_m2_det_context_conv1_bn_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "ssh_m2_det_context_conv1_bn", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[525, 0, 0], [526, 0, 0], [527, 0, 0], [528, 0, 1], [529, 0, 1]] + }, + { + "op": "Activation", + "name": "ssh_m2_det_context_conv1_relu", + "attrs": {"act_type": "relu"}, + "inputs": [[530, 0, 0]] + }, + { + "op": "null", + "name": "ssh_m2_det_context_conv2_weight", + "attrs": { + "__init__": "[\"normal\", {\"sigma\": 0.01}]", + "__lr_mult__": "1.0" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_m2_det_context_conv2_bias", + "attrs": { + "__init__": "[\"constant\", {\"value\": 0.0}]", + "__lr_mult__": "2.0", + "__wd_mult__": "0.0" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "ssh_m2_det_context_conv2", + "attrs": { + "kernel": "(3, 3)", + "num_filter": "128", + "pad": "(1, 1)", + "stride": "(1, 1)" + }, + "inputs": [[531, 0, 0], [532, 0, 0], [533, 0, 0]] + }, + { + "op": "null", + "name": "ssh_m2_det_context_conv2_bn_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_m2_det_context_conv2_bn_beta", + "attrs": { + "eps": "2e-05", + 
"fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_m2_det_context_conv2_bn_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_m2_det_context_conv2_bn_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "ssh_m2_det_context_conv2_bn", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[534, 0, 0], [535, 0, 0], [536, 0, 0], [537, 0, 1], [538, 0, 1]] + }, + { + "op": "null", + "name": "ssh_m2_det_context_conv3_1_weight", + "attrs": { + "__init__": "[\"normal\", {\"sigma\": 0.01}]", + "__lr_mult__": "1.0" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_m2_det_context_conv3_1_bias", + "attrs": { + "__init__": "[\"constant\", {\"value\": 0.0}]", + "__lr_mult__": "2.0", + "__wd_mult__": "0.0" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "ssh_m2_det_context_conv3_1", + "attrs": { + "kernel": "(3, 3)", + "num_filter": "128", + "pad": "(1, 1)", + "stride": "(1, 1)" + }, + "inputs": [[531, 0, 0], [540, 0, 0], [541, 0, 0]] + }, + { + "op": "null", + "name": "ssh_m2_det_context_conv3_1_bn_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_m2_det_context_conv3_1_bn_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_m2_det_context_conv3_1_bn_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_m2_det_context_conv3_1_bn_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + 
"fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "ssh_m2_det_context_conv3_1_bn", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[542, 0, 0], [543, 0, 0], [544, 0, 0], [545, 0, 1], [546, 0, 1]] + }, + { + "op": "Activation", + "name": "ssh_m2_det_context_conv3_1_relu", + "attrs": {"act_type": "relu"}, + "inputs": [[547, 0, 0]] + }, + { + "op": "null", + "name": "ssh_m2_det_context_conv3_2_weight", + "attrs": { + "__init__": "[\"normal\", {\"sigma\": 0.01}]", + "__lr_mult__": "1.0" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_m2_det_context_conv3_2_bias", + "attrs": { + "__init__": "[\"constant\", {\"value\": 0.0}]", + "__lr_mult__": "2.0", + "__wd_mult__": "0.0" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "ssh_m2_det_context_conv3_2", + "attrs": { + "kernel": "(3, 3)", + "num_filter": "128", + "pad": "(1, 1)", + "stride": "(1, 1)" + }, + "inputs": [[548, 0, 0], [549, 0, 0], [550, 0, 0]] + }, + { + "op": "null", + "name": "ssh_m2_det_context_conv3_2_bn_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_m2_det_context_conv3_2_bn_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_m2_det_context_conv3_2_bn_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_m2_det_context_conv3_2_bn_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "ssh_m2_det_context_conv3_2_bn", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[551, 0, 0], [552, 0, 0], [553, 0, 0], [554, 0, 1], 
[555, 0, 1]] + }, + { + "op": "Concat", + "name": "ssh_m2_det_concat", + "attrs": { + "dim": "1", + "num_args": "3" + }, + "inputs": [[522, 0, 0], [539, 0, 0], [556, 0, 0]] + }, + { + "op": "Activation", + "name": "ssh_m2_det_concat_relu", + "attrs": {"act_type": "relu"}, + "inputs": [[557, 0, 0]] + }, + { + "op": "null", + "name": "face_rpn_cls_score_stride16_weight", + "attrs": { + "__init__": "[\"normal\", {\"sigma\": 0.01}]", + "__lr_mult__": "1.0" + }, + "inputs": [] + }, + { + "op": "null", + "name": "face_rpn_cls_score_stride16_bias", + "attrs": { + "__init__": "[\"constant\", {\"value\": 0.0}]", + "__lr_mult__": "2.0", + "__wd_mult__": "0.0" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "face_rpn_cls_score_stride16", + "attrs": { + "kernel": "(1, 1)", + "num_filter": "4", + "pad": "(0, 0)", + "stride": "(1, 1)" + }, + "inputs": [[558, 0, 0], [559, 0, 0], [560, 0, 0]] + }, + { + "op": "Reshape", + "name": "face_rpn_cls_score_reshape_stride16", + "attrs": {"shape": "(0, 2, -1, 0)"}, + "inputs": [[561, 0, 0]] + }, + { + "op": "SoftmaxActivation", + "name": "face_rpn_cls_prob_stride16", + "attrs": {"mode": "channel"}, + "inputs": [[562, 0, 0]] + }, + { + "op": "Reshape", + "name": "face_rpn_cls_prob_reshape_stride16", + "attrs": {"shape": "(0, 4, -1, 0)"}, + "inputs": [[563, 0, 0]] + }, + { + "op": "null", + "name": "face_rpn_bbox_pred_stride16_weight", + "attrs": { + "__init__": "[\"normal\", {\"sigma\": 0.01}]", + "__lr_mult__": "1.0" + }, + "inputs": [] + }, + { + "op": "null", + "name": "face_rpn_bbox_pred_stride16_bias", + "attrs": { + "__init__": "[\"constant\", {\"value\": 0.0}]", + "__lr_mult__": "2.0", + "__wd_mult__": "0.0" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "face_rpn_bbox_pred_stride16", + "attrs": { + "kernel": "(1, 1)", + "num_filter": "8", + "pad": "(0, 0)", + "stride": "(1, 1)" + }, + "inputs": [[558, 0, 0], [565, 0, 0], [566, 0, 0]] + }, + { + "op": "null", + "name": 
"face_rpn_landmark_pred_stride16_weight", + "attrs": { + "__init__": "[\"normal\", {\"sigma\": 0.01}]", + "__lr_mult__": "1.0" + }, + "inputs": [] + }, + { + "op": "null", + "name": "face_rpn_landmark_pred_stride16_bias", + "attrs": { + "__init__": "[\"constant\", {\"value\": 0.0}]", + "__lr_mult__": "2.0", + "__wd_mult__": "0.0" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "face_rpn_landmark_pred_stride16", + "attrs": { + "kernel": "(1, 1)", + "num_filter": "20", + "pad": "(0, 0)", + "stride": "(1, 1)" + }, + "inputs": [[558, 0, 0], [568, 0, 0], [569, 0, 0]] + }, + { + "op": "null", + "name": "ssh_m1_red_conv_weight", + "attrs": { + "__init__": "[\"normal\", {\"sigma\": 0.01}]", + "__lr_mult__": "1.0" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_m1_red_conv_bias", + "attrs": { + "__init__": "[\"constant\", {\"value\": 0.0}]", + "__lr_mult__": "2.0", + "__wd_mult__": "1.0" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "ssh_m1_red_conv", + "attrs": { + "kernel": "(1, 1)", + "num_filter": "256", + "pad": "(0, 0)", + "stride": "(1, 1)" + }, + "inputs": [[207, 0, 0], [571, 0, 0], [572, 0, 0]] + }, + { + "op": "null", + "name": "ssh_m1_red_conv_bn_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_m1_red_conv_bn_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_m1_red_conv_bn_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_m1_red_conv_bn_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "ssh_m1_red_conv_bn", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + 
}, + "inputs": [[573, 0, 0], [574, 0, 0], [575, 0, 0], [576, 0, 1], [577, 0, 1]] + }, + { + "op": "Activation", + "name": "ssh_m1_red_conv_relu", + "attrs": {"act_type": "relu"}, + "inputs": [[578, 0, 0]] + }, + { + "op": "UpSampling", + "name": "ssh_m2_red_up", + "attrs": { + "num_args": "1", + "sample_type": "nearest", + "scale": "2", + "workspace": "512" + }, + "inputs": [[514, 0, 0]] + }, + { + "op": "Crop", + "name": "crop1", + "attrs": {"num_args": "2"}, + "inputs": [[580, 0, 0], [579, 0, 0]] + }, + { + "op": "elemwise_add", + "name": "_plus1", + "inputs": [[579, 0, 0], [581, 0, 0]] + }, + { + "op": "null", + "name": "ssh_c1_aggr_weight", + "attrs": { + "__init__": "[\"normal\", {\"sigma\": 0.01}]", + "__lr_mult__": "1.0" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_c1_aggr_bias", + "attrs": { + "__init__": "[\"constant\", {\"value\": 0.0}]", + "__lr_mult__": "2.0", + "__wd_mult__": "1.0" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "ssh_c1_aggr", + "attrs": { + "kernel": "(3, 3)", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)" + }, + "inputs": [[582, 0, 0], [583, 0, 0], [584, 0, 0]] + }, + { + "op": "null", + "name": "ssh_c1_aggr_bn_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_c1_aggr_bn_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_c1_aggr_bn_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_c1_aggr_bn_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "ssh_c1_aggr_bn", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": 
[[585, 0, 0], [586, 0, 0], [587, 0, 0], [588, 0, 1], [589, 0, 1]] + }, + { + "op": "Activation", + "name": "ssh_c1_aggr_relu", + "attrs": {"act_type": "relu"}, + "inputs": [[590, 0, 0]] + }, + { + "op": "null", + "name": "ssh_m1_det_conv1_weight", + "attrs": { + "__init__": "[\"normal\", {\"sigma\": 0.01}]", + "__lr_mult__": "1.0" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_m1_det_conv1_bias", + "attrs": { + "__init__": "[\"constant\", {\"value\": 0.0}]", + "__lr_mult__": "2.0", + "__wd_mult__": "0.0" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "ssh_m1_det_conv1", + "attrs": { + "kernel": "(3, 3)", + "num_filter": "256", + "pad": "(1, 1)", + "stride": "(1, 1)" + }, + "inputs": [[591, 0, 0], [592, 0, 0], [593, 0, 0]] + }, + { + "op": "null", + "name": "ssh_m1_det_conv1_bn_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_m1_det_conv1_bn_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_m1_det_conv1_bn_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_m1_det_conv1_bn_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "ssh_m1_det_conv1_bn", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[594, 0, 0], [595, 0, 0], [596, 0, 0], [597, 0, 1], [598, 0, 1]] + }, + { + "op": "null", + "name": "ssh_m1_det_context_conv1_weight", + "attrs": { + "__init__": "[\"normal\", {\"sigma\": 0.01}]", + "__lr_mult__": "1.0" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_m1_det_context_conv1_bias", + "attrs": { + "__init__": "[\"constant\", {\"value\": 0.0}]", + 
"__lr_mult__": "2.0", + "__wd_mult__": "0.0" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "ssh_m1_det_context_conv1", + "attrs": { + "kernel": "(3, 3)", + "num_filter": "128", + "pad": "(1, 1)", + "stride": "(1, 1)" + }, + "inputs": [[591, 0, 0], [600, 0, 0], [601, 0, 0]] + }, + { + "op": "null", + "name": "ssh_m1_det_context_conv1_bn_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_m1_det_context_conv1_bn_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_m1_det_context_conv1_bn_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_m1_det_context_conv1_bn_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "ssh_m1_det_context_conv1_bn", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[602, 0, 0], [603, 0, 0], [604, 0, 0], [605, 0, 1], [606, 0, 1]] + }, + { + "op": "Activation", + "name": "ssh_m1_det_context_conv1_relu", + "attrs": {"act_type": "relu"}, + "inputs": [[607, 0, 0]] + }, + { + "op": "null", + "name": "ssh_m1_det_context_conv2_weight", + "attrs": { + "__init__": "[\"normal\", {\"sigma\": 0.01}]", + "__lr_mult__": "1.0" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_m1_det_context_conv2_bias", + "attrs": { + "__init__": "[\"constant\", {\"value\": 0.0}]", + "__lr_mult__": "2.0", + "__wd_mult__": "0.0" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "ssh_m1_det_context_conv2", + "attrs": { + "kernel": "(3, 3)", + "num_filter": "128", + "pad": "(1, 1)", + "stride": "(1, 1)" + }, + "inputs": [[608, 0, 0], [609, 0, 0], [610, 0, 0]] + }, + { 
+ "op": "null", + "name": "ssh_m1_det_context_conv2_bn_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_m1_det_context_conv2_bn_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_m1_det_context_conv2_bn_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_m1_det_context_conv2_bn_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "ssh_m1_det_context_conv2_bn", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[611, 0, 0], [612, 0, 0], [613, 0, 0], [614, 0, 1], [615, 0, 1]] + }, + { + "op": "null", + "name": "ssh_m1_det_context_conv3_1_weight", + "attrs": { + "__init__": "[\"normal\", {\"sigma\": 0.01}]", + "__lr_mult__": "1.0" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_m1_det_context_conv3_1_bias", + "attrs": { + "__init__": "[\"constant\", {\"value\": 0.0}]", + "__lr_mult__": "2.0", + "__wd_mult__": "0.0" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "ssh_m1_det_context_conv3_1", + "attrs": { + "kernel": "(3, 3)", + "num_filter": "128", + "pad": "(1, 1)", + "stride": "(1, 1)" + }, + "inputs": [[608, 0, 0], [617, 0, 0], [618, 0, 0]] + }, + { + "op": "null", + "name": "ssh_m1_det_context_conv3_1_bn_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_m1_det_context_conv3_1_bn_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_m1_det_context_conv3_1_bn_moving_mean", + "attrs": { + 
"__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_m1_det_context_conv3_1_bn_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "ssh_m1_det_context_conv3_1_bn", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[619, 0, 0], [620, 0, 0], [621, 0, 0], [622, 0, 1], [623, 0, 1]] + }, + { + "op": "Activation", + "name": "ssh_m1_det_context_conv3_1_relu", + "attrs": {"act_type": "relu"}, + "inputs": [[624, 0, 0]] + }, + { + "op": "null", + "name": "ssh_m1_det_context_conv3_2_weight", + "attrs": { + "__init__": "[\"normal\", {\"sigma\": 0.01}]", + "__lr_mult__": "1.0" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_m1_det_context_conv3_2_bias", + "attrs": { + "__init__": "[\"constant\", {\"value\": 0.0}]", + "__lr_mult__": "2.0", + "__wd_mult__": "0.0" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "ssh_m1_det_context_conv3_2", + "attrs": { + "kernel": "(3, 3)", + "num_filter": "128", + "pad": "(1, 1)", + "stride": "(1, 1)" + }, + "inputs": [[625, 0, 0], [626, 0, 0], [627, 0, 0]] + }, + { + "op": "null", + "name": "ssh_m1_det_context_conv3_2_bn_gamma", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_m1_det_context_conv3_2_bn_beta", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_m1_det_context_conv3_2_bn_moving_mean", + "attrs": { + "__init__": "[\"zero\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "null", + "name": "ssh_m1_det_context_conv3_2_bn_moving_var", + "attrs": { + "__init__": "[\"one\", {}]", + "eps": "2e-05", + "fix_gamma": "False", + 
"momentum": "0.9" + }, + "inputs": [] + }, + { + "op": "BatchNorm", + "name": "ssh_m1_det_context_conv3_2_bn", + "attrs": { + "eps": "2e-05", + "fix_gamma": "False", + "momentum": "0.9" + }, + "inputs": [[628, 0, 0], [629, 0, 0], [630, 0, 0], [631, 0, 1], [632, 0, 1]] + }, + { + "op": "Concat", + "name": "ssh_m1_det_concat", + "attrs": { + "dim": "1", + "num_args": "3" + }, + "inputs": [[599, 0, 0], [616, 0, 0], [633, 0, 0]] + }, + { + "op": "Activation", + "name": "ssh_m1_det_concat_relu", + "attrs": {"act_type": "relu"}, + "inputs": [[634, 0, 0]] + }, + { + "op": "null", + "name": "face_rpn_cls_score_stride8_weight", + "attrs": { + "__init__": "[\"normal\", {\"sigma\": 0.01}]", + "__lr_mult__": "1.0" + }, + "inputs": [] + }, + { + "op": "null", + "name": "face_rpn_cls_score_stride8_bias", + "attrs": { + "__init__": "[\"constant\", {\"value\": 0.0}]", + "__lr_mult__": "2.0", + "__wd_mult__": "0.0" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "face_rpn_cls_score_stride8", + "attrs": { + "kernel": "(1, 1)", + "num_filter": "4", + "pad": "(0, 0)", + "stride": "(1, 1)" + }, + "inputs": [[635, 0, 0], [636, 0, 0], [637, 0, 0]] + }, + { + "op": "Reshape", + "name": "face_rpn_cls_score_reshape_stride8", + "attrs": {"shape": "(0, 2, -1, 0)"}, + "inputs": [[638, 0, 0]] + }, + { + "op": "SoftmaxActivation", + "name": "face_rpn_cls_prob_stride8", + "attrs": {"mode": "channel"}, + "inputs": [[639, 0, 0]] + }, + { + "op": "Reshape", + "name": "face_rpn_cls_prob_reshape_stride8", + "attrs": {"shape": "(0, 4, -1, 0)"}, + "inputs": [[640, 0, 0]] + }, + { + "op": "null", + "name": "face_rpn_bbox_pred_stride8_weight", + "attrs": { + "__init__": "[\"normal\", {\"sigma\": 0.01}]", + "__lr_mult__": "1.0" + }, + "inputs": [] + }, + { + "op": "null", + "name": "face_rpn_bbox_pred_stride8_bias", + "attrs": { + "__init__": "[\"constant\", {\"value\": 0.0}]", + "__lr_mult__": "2.0", + "__wd_mult__": "0.0" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": 
"face_rpn_bbox_pred_stride8", + "attrs": { + "kernel": "(1, 1)", + "num_filter": "8", + "pad": "(0, 0)", + "stride": "(1, 1)" + }, + "inputs": [[635, 0, 0], [642, 0, 0], [643, 0, 0]] + }, + { + "op": "null", + "name": "face_rpn_landmark_pred_stride8_weight", + "attrs": { + "__init__": "[\"normal\", {\"sigma\": 0.01}]", + "__lr_mult__": "1.0" + }, + "inputs": [] + }, + { + "op": "null", + "name": "face_rpn_landmark_pred_stride8_bias", + "attrs": { + "__init__": "[\"constant\", {\"value\": 0.0}]", + "__lr_mult__": "2.0", + "__wd_mult__": "0.0" + }, + "inputs": [] + }, + { + "op": "Convolution", + "name": "face_rpn_landmark_pred_stride8", + "attrs": { + "kernel": "(1, 1)", + "num_filter": "20", + "pad": "(0, 0)", + "stride": "(1, 1)" + }, + "inputs": [[635, 0, 0], [645, 0, 0], [646, 0, 0]] + } + ], + "arg_nodes": [ + 0, + 1, + 2, + 3, + 4, + 6, + 8, + 9, + 10, + 11, + 15, + 16, + 17, + 18, + 21, + 23, + 24, + 25, + 26, + 29, + 31, + 32, + 33, + 34, + 37, + 39, + 42, + 43, + 44, + 45, + 48, + 50, + 51, + 52, + 53, + 56, + 58, + 59, + 60, + 61, + 64, + 67, + 68, + 69, + 70, + 73, + 75, + 76, + 77, + 78, + 81, + 83, + 84, + 85, + 86, + 89, + 92, + 93, + 94, + 95, + 98, + 100, + 101, + 102, + 103, + 106, + 108, + 109, + 110, + 111, + 114, + 116, + 119, + 120, + 121, + 122, + 125, + 127, + 128, + 129, + 130, + 133, + 135, + 136, + 137, + 138, + 141, + 144, + 145, + 146, + 147, + 150, + 152, + 153, + 154, + 155, + 158, + 160, + 161, + 162, + 163, + 166, + 169, + 170, + 171, + 172, + 175, + 177, + 178, + 179, + 180, + 183, + 185, + 186, + 187, + 188, + 191, + 194, + 195, + 196, + 197, + 200, + 202, + 203, + 204, + 205, + 208, + 210, + 211, + 212, + 213, + 216, + 218, + 221, + 222, + 223, + 224, + 227, + 229, + 230, + 231, + 232, + 235, + 237, + 238, + 239, + 240, + 243, + 246, + 247, + 248, + 249, + 252, + 254, + 255, + 256, + 257, + 260, + 262, + 263, + 264, + 265, + 268, + 271, + 272, + 273, + 274, + 277, + 279, + 280, + 281, + 282, + 285, + 287, + 288, + 289, + 290, + 
293, + 296, + 297, + 298, + 299, + 302, + 304, + 305, + 306, + 307, + 310, + 312, + 313, + 314, + 315, + 318, + 321, + 322, + 323, + 324, + 327, + 329, + 330, + 331, + 332, + 335, + 337, + 338, + 339, + 340, + 343, + 346, + 347, + 348, + 349, + 352, + 354, + 355, + 356, + 357, + 360, + 362, + 363, + 364, + 365, + 368, + 370, + 373, + 374, + 375, + 376, + 379, + 381, + 382, + 383, + 384, + 387, + 389, + 390, + 391, + 392, + 395, + 398, + 399, + 400, + 401, + 404, + 406, + 407, + 408, + 409, + 412, + 414, + 415, + 416, + 417, + 420, + 423, + 424, + 425, + 426, + 429, + 430, + 432, + 433, + 434, + 435, + 438, + 439, + 441, + 442, + 443, + 444, + 446, + 447, + 449, + 450, + 451, + 452, + 455, + 456, + 458, + 459, + 460, + 461, + 463, + 464, + 466, + 467, + 468, + 469, + 472, + 473, + 475, + 476, + 477, + 478, + 482, + 483, + 488, + 489, + 491, + 492, + 494, + 495, + 497, + 498, + 499, + 500, + 506, + 507, + 509, + 510, + 511, + 512, + 515, + 516, + 518, + 519, + 520, + 521, + 523, + 524, + 526, + 527, + 528, + 529, + 532, + 533, + 535, + 536, + 537, + 538, + 540, + 541, + 543, + 544, + 545, + 546, + 549, + 550, + 552, + 553, + 554, + 555, + 559, + 560, + 565, + 566, + 568, + 569, + 571, + 572, + 574, + 575, + 576, + 577, + 583, + 584, + 586, + 587, + 588, + 589, + 592, + 593, + 595, + 596, + 597, + 598, + 600, + 601, + 603, + 604, + 605, + 606, + 609, + 610, + 612, + 613, + 614, + 615, + 617, + 618, + 620, + 621, + 622, + 623, + 626, + 627, + 629, + 630, + 631, + 632, + 636, + 637, + 642, + 643, + 645, + 646 + ], + "node_row_ptr": [ + 0, + 1, + 2, + 3, + 4, + 5, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 90, + 91, + 92, + 
93, + 94, + 95, + 96, + 97, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 121, + 122, + 123, + 124, + 125, + 126, + 127, + 128, + 131, + 132, + 133, + 134, + 135, + 136, + 137, + 138, + 141, + 142, + 143, + 144, + 145, + 146, + 147, + 148, + 149, + 150, + 151, + 154, + 155, + 156, + 157, + 158, + 159, + 160, + 161, + 164, + 165, + 166, + 167, + 168, + 169, + 170, + 171, + 174, + 175, + 176, + 177, + 178, + 179, + 180, + 181, + 182, + 185, + 186, + 187, + 188, + 189, + 190, + 191, + 192, + 195, + 196, + 197, + 198, + 199, + 200, + 201, + 202, + 205, + 206, + 207, + 208, + 209, + 210, + 211, + 212, + 213, + 216, + 217, + 218, + 219, + 220, + 221, + 222, + 223, + 226, + 227, + 228, + 229, + 230, + 231, + 232, + 233, + 236, + 237, + 238, + 239, + 240, + 241, + 242, + 243, + 244, + 247, + 248, + 249, + 250, + 251, + 252, + 253, + 254, + 257, + 258, + 259, + 260, + 261, + 262, + 263, + 264, + 267, + 268, + 269, + 270, + 271, + 272, + 273, + 274, + 275, + 276, + 277, + 280, + 281, + 282, + 283, + 284, + 285, + 286, + 287, + 290, + 291, + 292, + 293, + 294, + 295, + 296, + 297, + 300, + 301, + 302, + 303, + 304, + 305, + 306, + 307, + 308, + 311, + 312, + 313, + 314, + 315, + 316, + 317, + 318, + 321, + 322, + 323, + 324, + 325, + 326, + 327, + 328, + 331, + 332, + 333, + 334, + 335, + 336, + 337, + 338, + 339, + 342, + 343, + 344, + 345, + 346, + 347, + 348, + 349, + 352, + 353, + 354, + 355, + 356, + 357, + 358, + 359, + 362, + 363, + 364, + 365, + 366, + 367, + 368, + 369, + 370, + 373, + 374, + 375, + 376, + 377, + 378, + 379, + 380, + 383, + 384, + 385, + 386, + 387, + 388, + 389, + 390, + 393, + 394, + 395, + 396, + 397, + 398, + 399, + 400, + 401, + 404, + 405, + 406, + 407, + 408, + 409, + 410, + 411, + 414, + 415, + 416, + 417, + 418, + 419, + 420, + 421, + 424, + 425, + 426, + 427, + 428, + 429, + 430, + 431, + 432, + 435, + 436, + 437, + 438, + 439, + 440, + 441, + 442, + 445, + 446, + 
447, + 448, + 449, + 450, + 451, + 452, + 455, + 456, + 457, + 458, + 459, + 460, + 461, + 462, + 463, + 464, + 465, + 468, + 469, + 470, + 471, + 472, + 473, + 474, + 475, + 478, + 479, + 480, + 481, + 482, + 483, + 484, + 485, + 488, + 489, + 490, + 491, + 492, + 493, + 494, + 495, + 496, + 499, + 500, + 501, + 502, + 503, + 504, + 505, + 506, + 509, + 510, + 511, + 512, + 513, + 514, + 515, + 516, + 519, + 520, + 521, + 522, + 523, + 524, + 525, + 526, + 527, + 530, + 531, + 532, + 533, + 534, + 535, + 536, + 537, + 538, + 541, + 542, + 543, + 544, + 545, + 546, + 547, + 548, + 549, + 552, + 553, + 554, + 555, + 556, + 557, + 558, + 559, + 562, + 563, + 564, + 565, + 566, + 567, + 568, + 569, + 570, + 573, + 574, + 575, + 576, + 577, + 578, + 579, + 580, + 583, + 584, + 585, + 586, + 587, + 588, + 589, + 590, + 591, + 594, + 595, + 596, + 597, + 598, + 599, + 600, + 601, + 602, + 603, + 604, + 605, + 606, + 607, + 608, + 609, + 610, + 611, + 612, + 613, + 614, + 615, + 618, + 619, + 620, + 621, + 622, + 623, + 624, + 625, + 626, + 627, + 628, + 629, + 632, + 633, + 634, + 635, + 636, + 637, + 638, + 639, + 640, + 643, + 644, + 645, + 646, + 647, + 648, + 649, + 650, + 653, + 654, + 655, + 656, + 657, + 658, + 659, + 660, + 661, + 664, + 665, + 666, + 667, + 668, + 669, + 670, + 671, + 674, + 675, + 676, + 677, + 678, + 679, + 680, + 681, + 682, + 685, + 686, + 687, + 688, + 689, + 690, + 691, + 692, + 693, + 694, + 695, + 696, + 697, + 698, + 699, + 700, + 701, + 702, + 703, + 704, + 705, + 706, + 709, + 710, + 711, + 712, + 713, + 714, + 715, + 716, + 717, + 718, + 719, + 720, + 723, + 724, + 725, + 726, + 727, + 728, + 729, + 730, + 731, + 734, + 735, + 736, + 737, + 738, + 739, + 740, + 741, + 744, + 745, + 746, + 747, + 748, + 749, + 750, + 751, + 752, + 755, + 756, + 757, + 758, + 759, + 760, + 761, + 762, + 765, + 766, + 767, + 768, + 769, + 770, + 771, + 772, + 773, + 776, + 777, + 778, + 779, + 780, + 781, + 782, + 783, + 784, + 785, + 786, + 787, + 788, 
+ 789, + 790 + ], + "heads": [[487, 0, 0], [490, 0, 0], [493, 0, 0], [564, 0, 0], [567, 0, 0], [570, 0, 0], [641, 0, 0], [644, 0, 0], [647, 0, 0]], + "attrs": {"mxnet_version": ["int", 10300]} +} \ No newline at end of file diff --git a/insightface/models/scrfd_10g/model.pth b/insightface/models/scrfd_10g/model.pth new file mode 100644 index 0000000000000000000000000000000000000000..b7d98ca2339b57fa68b426beac1b67faffd30934 --- /dev/null +++ b/insightface/models/scrfd_10g/model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:963570df5e0ebf6bb313239d0f9f3f0c096c1ff6937e8e28e45abad4d8b1d5c7 +size 15545065 diff --git a/insightface/models/scrfd_10g_bnkps/model.pth b/insightface/models/scrfd_10g_bnkps/model.pth new file mode 100644 index 0000000000000000000000000000000000000000..92b92fea06a90124bb372f1e6b2bb557fe7cdaed --- /dev/null +++ b/insightface/models/scrfd_10g_bnkps/model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d431436577d01c827abd78aa40c782b8fb318c26555ac60582144aaf66867411 +size 17005828 diff --git a/insightface/models/scrfd_1g/model.pth b/insightface/models/scrfd_1g/model.pth new file mode 100644 index 0000000000000000000000000000000000000000..fb603e38d020bac3dbc62e5014f40e4fa236b3ee --- /dev/null +++ b/insightface/models/scrfd_1g/model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7d7d654c992c1581270461466a52c876234ad8be0ad8de37b9782d9f03beb86 +size 2647067 diff --git a/insightface/models/scrfd_2.5g/model.pth b/insightface/models/scrfd_2.5g/model.pth new file mode 100644 index 0000000000000000000000000000000000000000..41a4f98d5e44125dce51a2c53f68524af4cee43d --- /dev/null +++ b/insightface/models/scrfd_2.5g/model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fbe1d35ac6e0859307067bc3ccd44973b536b451437d23547fc460a05d00993f +size 2781443 diff --git a/insightface/models/scrfd_2.5g_bnkps/model.pth 
b/insightface/models/scrfd_2.5g_bnkps/model.pth new file mode 100644 index 0000000000000000000000000000000000000000..560a7d45b0c27f02fe62b6afa0b3e5ae4d899e46 --- /dev/null +++ b/insightface/models/scrfd_2.5g_bnkps/model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3db3b99c09e9212e9f2bb3970f6e641ec1812f27b19753f68289326067209662 +size 3346972 diff --git a/insightface/models/scrfd_34g/model.pth b/insightface/models/scrfd_34g/model.pth new file mode 100644 index 0000000000000000000000000000000000000000..84d0571aef546c73f65d286d60d809bf39d4d723 --- /dev/null +++ b/insightface/models/scrfd_34g/model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a6f69956639da31c96d8985c9a0ce1f5798f42cb64909159596e7a5f544ebe00 +size 39677731 diff --git a/insightface/models/scrfd_500m/model.pth b/insightface/models/scrfd_500m/model.pth new file mode 100644 index 0000000000000000000000000000000000000000..6af985fb9cc04a14d50c9f3bdd157db7a6203ccc --- /dev/null +++ b/insightface/models/scrfd_500m/model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1043ab96cff67ee8ebb5fc2819f23f3620a128d133f5b5234cd2aedeeb83b5f0 +size 2404021 diff --git a/insightface/models/scrfd_500m_bnkps/model.pth b/insightface/models/scrfd_500m_bnkps/model.pth new file mode 100644 index 0000000000000000000000000000000000000000..49006a6d5fa5ff05a58338bb07d32885626fb7d0 --- /dev/null +++ b/insightface/models/scrfd_500m_bnkps/model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b04315df8db019067edacaceb73484e531981442f321432b8bf003e9812d6b3d +size 2669108 diff --git a/insightface/models/scrfd_person_2.5g.onnx b/insightface/models/scrfd_person_2.5g.onnx new file mode 100644 index 0000000000000000000000000000000000000000..2b896125e58600489b082a57d275808267cb8e68 --- /dev/null +++ b/insightface/models/scrfd_person_2.5g.onnx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:76522ba15eecb0712780509e912884aba066e9834be0c85761918cdcf76de5b5 +size 3710223 diff --git a/insightface/models/synthetic_resnet50d.ckpt b/insightface/models/synthetic_resnet50d.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..4dbe94011cafed1d9ef7849f7a1f48c008a3b60a --- /dev/null +++ b/insightface/models/synthetic_resnet50d.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01b3d5533999da3e605e5b9d99fb0a2a55e634467346c7504e3fbf778cfb219e +size 190838028 diff --git a/talknet-asd/.dockerignore b/talknet-asd/.dockerignore new file mode 100644 index 0000000000000000000000000000000000000000..99043e7850b45dd4f38910807926a4aa7dc985cc --- /dev/null +++ b/talknet-asd/.dockerignore @@ -0,0 +1,20 @@ +# The .dockerignore file excludes files from the container build process. +# +# https://docs.docker.com/engine/reference/builder/#dockerignore-file + +# Cog +/demo/* + +# Exclude Git files +.git +.github +.gitignore + +# Exclude Python cache files +__pycache__ +.mypy_cache +.pytest_cache +.ruff_cache + +# Exclude Python virtual environment +/venv diff --git a/talknet-asd/.gitignore b/talknet-asd/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..18ca3367b3afd418106d3d2105142f6507ca2672 --- /dev/null +++ b/talknet-asd/.gitignore @@ -0,0 +1,118 @@ +# Other files +*.model +*.pth +*.wav +*.mp4 +*.txt +*.pcm +*.avi +data/ +tests/ +exps/ +/demo/* +.cog + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ \ No newline at end of file diff --git a/talknet-asd/FAQ.md b/talknet-asd/FAQ.md new file mode 100644 index 0000000000000000000000000000000000000000..df21a3fa0f04beae897c55e401d489bc1dc163e2 --- /dev/null +++ b/talknet-asd/FAQ.md @@ -0,0 +1,54 @@ +## 1. General Question + +### 1.1 Which dataset is used for training and testing ? +'pretrain_AVA.model' is trained on the AVA training set and evaluate on the AVA val/test set, (Has the entire code) +'pretrain_TalkSet.model' is trained on our TalkSet and evaluate on the Columbia ASD set or other raw videos. + +### 1.2 How to figure the variable length of data during training ? +We design a scheme to feed the variable-length data into the same mini-batch: we sort all videos by their length and put the videos with similar length into the same batch. We crop all videos into the minimum number of frames in this batch. In this way, we train the TalkNet with videos of different length without losing too much data. + +### 1.3 How to figure multiple faces on the screen ? +In the ASD task, when there are multiple face tracks in the video, we consider one track at a time. 
The face track of interest is given in each test trial. You can also consider the relationship between the faces on the screen at the same time. There are some papers about that. +### 1.4 Error: RuntimeError: CUDA error: no kernel image is available for execution on the device +Do "pip install --pre torch torchvision -f https://download.pytorch.org/whl/nightly/cu111/torch_nightly.html -U", check this [page](https://github.com/pytorch/pytorch/issues/31285#issuecomment-739139454). + +### 1.5 Can not download csv, video data or pretrain model ? +I use google drive to upload the pretrain model and csv files. So you need to make sure you can use google drive under your internet. The error during extract video clips can be ignored. + +*** + +## 2. TalkNet in AVA-Activespeaker dataset + +### 2.1 Can not reimplement the result ? +In our experiments, for the result in AVA validation set, for the same code/model, the best training result is 92.6mAP, the worst one is 92.1mAP. So it is reasonable if you get a result slightly different from this 92.3mAP. Also batchsize might affect the result (not too much). + +### 2.2 How to get the result in AVA test set ? +AVA test set did not release the labels. So you need to upload your csv result in their system. Notice that we added the first line in the `test_res.csv` file since we modified it a bit for the `get_ava_active_speaker_performance.py`. You need to delete the first line when you upload it. For the upload file, you need to set all `label` as `SPEAKING_AUDIBLE`. + +### 2.3 What are the labels ? Where is SPEAKING_BUT_NOT_AUDIBLE ? +There are three labels: SPEAKING_AND_AUDIBLE, SPEAKING_BUT_NOT_AUDIBLE, NOT_SPEAKING, but in the final evaluation, SPEAKING_BUT_NOT_AUDIBLE and NOT_SPEAKING share the same label. So this is a binary classification issue. + +### 2.4 How big is your model ? How long for training? +Our model has 15.01M params, in one 22G GPU, each epoch we train 15 mins, evaluate in val set takes 5 mins.
Train 25 epochs can get the best result. So at most 7 hours. + +*** + +## 3. TalkNet in TalkSet and Columbia ASD dataset + +### 3.1 Why did you build TalkSet instead of only using the AVA dataset ? +Because we want our model to be usable for all videos in the wild. AVA dataset has already provided the face bounding box for each trial, so for the videos not in AVA. If you want to do ASD, you need to do face detection first. In our experiments, the face detection method used in AVA is hard to reimplement. Which means we can hardly get the face area that is similar to the detected face in AVA. Due to that, the model trained in AVA can not perform well in videos outside AVA if we use other face detection method. +Due to that, we build TalkSet, the face in TalkSet is all detected by S3FD. So for any raw video (Such as the videos in Col ASD dataset), we can use S3FD to do face detection first, then apply our TalkNet model to get the ASD label. + +### 3.2 TalkSet code can not work? +We did not verify this code. Because we just modify LRS3 and VoxCeleb2 to build this set, so we do not (or cannot) upload this set. This `generate_TalkSet.py` is what we used to generate this dataset, and we did not check it later. So it is just for your reference. We have already provided the data list, so you can generate this dataset based on it. + +*** + +## 4. An ASD Demo with pretrained TalkNet model + +### 4.1 I try the demo, the performance is not so good ? +You can check the demo video `001.mp4` first and compare your output and our result `001_res.mp4` to make sure what you did is correct. So if you are not satisfied with the performance, we are sorry about that (~cry). We think this model can further improve. For the very short clips (less than 1s), small face and side face, the performance is not so good. + +### 4.2 I try the demo, the face can not be detected ? +That is caused by the face detection model, not the ASD part.
You can use a better face detection model such as [Insightface](https://github.com/deepinsight/insightface +). Only when the face can be detected can the ASD model produce the ASD labels. diff --git a/talknet-asd/LICENSE.md b/talknet-asd/LICENSE.md new file mode 100644 index 0000000000000000000000000000000000000000..d7a9a82afe523f2ab50ac259f8c8899b7eeaaa77 --- /dev/null +++ b/talknet-asd/LICENSE.md @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2021 Tao Ruijie + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/talknet-asd/README.md b/talknet-asd/README.md new file mode 100644 index 0000000000000000000000000000000000000000..99178b2bb213e4c5eb3420d776206d1eee70dc51 --- /dev/null +++ b/talknet-asd/README.md @@ -0,0 +1,146 @@ +## Is someone talking? TalkNet: Audio-visual active speaker detection Model + +This repository contains the code for our ACM MM 2021 paper (oral), TalkNet, an active speaker detection model to detect 'whether the face in the screen is speaking or not?'.
[[Paper](https://arxiv.org/pdf/2107.06592.pdf)] [[Video_English](https://youtu.be/C6bpAgI9zxE)] [[Video_Chinese](https://www.bilibili.com/video/bv1Yw411d7HG)]. + +### Updates: + +A new [demo page](https://www.sievedata.com/functions/sieve/talknet-asd). Thanks the contribution from [mvoodarla](https://github.com/mvoodarla) ! + +![overall.png](utils/overall.png) + +- [**Awesome ASD**](https://github.com/TaoRuijie/TalkNet_ASD/blob/main/awesomeASD.md): Papers about active speaker detection in last years. + +- **TalkNet in AVA-Activespeaker dataset**: The code to preprocess the AVA-ActiveSpeaker dataset, train TalkNet in AVA train set and evaluate it in AVA val/test set. + +- **TalkNet in TalkSet and Columbia ASD dataset**: The code to generate TalkSet, an ASD dataset in the wild, based on VoxCeleb2 and LRS3, train TalkNet in TalkSet and evaluate it in Columnbia ASD dataset. + +- **An ASD Demo with pretrained TalkNet model**: An end-to-end script to detect and mark the speaking face by the pretrained TalkNet model. + +*** + +### Dependencies + +Start from building the environment +``` +conda create -n TalkNet python=3.7.9 anaconda +conda activate TalkNet +pip install -r requirement.txt +``` + +Start from the existing environment +``` +pip install -r requirement.txt +``` + +*** + +## TalkNet in AVA-Activespeaker dataset + +#### Data preparation + +The following script can be used to download and prepare the AVA dataset for training. + +``` +python trainTalkNet.py --dataPathAVA AVADataPath --download +``` + +`AVADataPath` is the folder you want to save the AVA dataset and its preprocessing outputs, the details can be found in [here](https://github.com/TaoRuijie/TalkNet_ASD/blob/main/utils/tools.py#L34) . Please read them carefully. 
+ +#### Training +Then you can train TalkNet in AVA end-to-end by using: +``` +python trainTalkNet.py --dataPathAVA AVADataPath +``` +`exps/exps1/score.txt`: output score file, `exps/exp1/model/model_00xx.model`: trained model, `exps/exps1/val_res.csv`: prediction for val set. + +#### Pretrained model +Our pretrained model performs `mAP: 92.3` in validation set, you can check it by using: +``` +python trainTalkNet.py --dataPathAVA AVADataPath --evaluation +``` +The pretrained model will automaticly be downloaded into `TalkNet_ASD/pretrain_AVA.model`. It performs `mAP: 90.8` in the testing set. + +*** + +## TalkNet in TalkSet and Columbia ASD dataset + +#### Data preparation + +We find that it is challenge to apply the model we trained in AVA for the videos not in AVA (Reason is [here](https://github.com/TaoRuijie/TalkNet_ASD/blob/main/FAQ.md), Q3.1). So we build TalkSet, an active speaker detection dataset in the wild, based on `VoxCeleb2` and `LRS3`. + +We do not plan to upload this dataset since we just modify it, instead of building it. In `TalkSet` folder we provide these `.txt` files to describe which files we used to generate the TalkSet and their ASD labels. You can generate this `TalkSet` if you are interested to train an ASD model in the wild. + +Also, we have provided our pretrained TalkNet model in TalkSet. You can evaluate it in Columbia ASD dataset or other raw videos in the wild. + +#### Usage + +A pretrain model in TalkSet will be download into `TalkNet_ASD/pretrain_TalkSet.model` when using the following script: + +``` +python demoTalkNet.py --evalCol --colSavePath colDataPath +``` + +Also, Columnbia ASD dataset and the labels will be downloaded into `colDataPath`. Finally you can get the following F1 result. + +| Name | Bell | Boll | Lieb | Long | Sick | Avg. 
| +|----- | ------ | ------ | ------ | ------ | ------ | ------ | +| F1 | 98.1 | 88.8 | 98.7 | 98.0 | 97.7 | 96.3 | + +(This result is different from that in our paper because we train the model again, while the avg. F1 is very similar) +*** + +## An ASD Demo with pretrained TalkNet model + +#### Data preparation + +We build an end-to-end script to detect and extract the active speaker from the raw video by our pretrain model in TalkSet. + +You can put the raw video (`.mp4` and `.avi` are both fine) into the `demo` folder, such as `001.mp4`. + +#### Usage + +``` +python demoTalkNet.py --videoName 001 +``` + +A pretrain model in TalkSet will be downloaded into `TalkNet_ASD/pretrain_TalkSet.model`. The structure of the output reults can be found in [here](https://github.com/TaoRuijie/TalkNet_ASD/blob/main/demoTalkNet.py#L351). + +You can get the output video `demo/001/pyavi/video_out.avi`, which has marked the active speaker by green box and non-active speaker by red box. + +If you want to evaluate by using cpu only, you can modify `demoTalkNet.py` and `talkNet.py` file: modify all `cuda` into `cpu`. Then replace line 83 in talkNet.py into `loadedState = torch.load(path,map_location=torch.device('cpu'))` + +*** + +### Citation + +Please cite the following if our paper or code is helpful to your research. +``` +@inproceedings{tao2021someone, + title={Is Someone Speaking? Exploring Long-term Temporal Features for Audio-visual Active Speaker Detection}, + author={Tao, Ruijie and Pan, Zexu and Das, Rohan Kumar and Qian, Xinyuan and Shou, Mike Zheng and Li, Haizhou}, + booktitle = {Proceedings of the 29th ACM International Conference on Multimedia}, + pages = {3927–3935}, + year={2021} +} +``` +I have summaried some potential [FAQs](https://github.com/TaoRuijie/TalkNet_ASD/blob/main/FAQ.md). You can also check the `issues` in Github for other questions that I have answered. 
This is my first open-source work. Please let me know if I can further improve this repository or if there is anything wrong in our work. Thanks for your support!

### Acknowledgement

We studied many useful projects during our coding process, including:

The structure of the project layout and the audio encoder is learnt from this [repository](https://github.com/clovaai/voxceleb_trainer).

The demo for visualization is modified from this [repository](https://github.com/joonson/syncnet_python).

The AVA data download code is learnt from this [repository](https://github.com/fuankarion/active-speakers-context).

The model for the visual frontend is learnt from this [repository](https://github.com/lordmartian/deep_avsr).

Thanks to these authors for open-sourcing their code!

### Cooperation

If you are interested in working on this topic and have some ideas to implement, I am glad to collaborate and contribute with my experience & knowledge in this topic. Please contact me at ruijie.tao@u.nus.edu. diff --git a/talknet-asd/TalkSet/README.md b/talknet-asd/TalkSet/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ab2a543947c573a5f952fbc36cc45cde5bab0716 --- /dev/null +++ b/talknet-asd/TalkSet/README.md @@ -0,0 +1,48 @@ +### TalkSet Generation

You can check 'train.txt' and 'test.txt' to generate TalkSet on your own.

This `generate_TalkSet.py` code is provided for your reference only; I have not checked it recently. 
Input: the LRS3 and VoxCeleb2 list files (3 files) in `lists_in`
Output: TalkSet, train.txt and test.txt (here the test set is actually the validation set)

### Usage:

Set the following parser arguments based on the location of your data:

`out_path`: the output TalkSet location
`Vox_audio`: location of the VoxCeleb2 training-set audio
`Vox_video`: location of the VoxCeleb2 training-set video
`lrs3_audio`: location of the LRS3 audio
`lrs3_video`: location of the LRS3 video
`task`: the part of the TalkSet you want to generate, e.g. TAudio
`num_cpu`: the number of threads; higher is faster, depending on your PC performance, e.g. 10

```
python TalkSet/generate_TalkSet.py --task 'TAudio'
python TalkSet/generate_TalkSet.py --task 'FAudio'
python TalkSet/generate_TalkSet.py --task 'TFAudio'
python TalkSet/generate_TalkSet.py --task 'TSilence'
python TalkSet/generate_TalkSet.py --task 'FSilence'
python TalkSet/generate_TalkSet.py --task 'Fusion'
```

### Description:
For the `lists_out/*.txt` files:
- The 1st row is the face clip's data type,
  - TAudio: audio is active, lips are moving, audio and lips are in sync
  - FAudio: audio is active, lips are moving, audio and lips are not in sync (speech from others)
  - TFAudio: one part is 'TAudio', the other part is 'FAudio'
  - TSilence: one part is 'TAudio'; in the other part, audio is non-active and lips are not moving
  - FSilence: one part is 'silence' (audio is non-active, lips are not moving); in the other part, audio is active and lips are not moving (speech from others)
- The 2nd row is the path for the audio file (filename started from 'silence' is the data from LRS3, filename started from 'id.....' 
is the data from VoxCeleb2) +- The 3rd row is the path for the video file +- The 4th row is the length(seconds) of this data +- The 5th row is the start of 'active' clip (in FSilence, it presents the 'silence' part) +- The 6th row is the end of 'active' clip +- The 7th row is the start of 'non-active' clip (in FSilence, it presents the 'speech from others' part) +- The 8th row is the end of 'non-active' clip +- The 9th row is the file ID + +The dataset generated will not be fixed each time because we randomly select FSlience data, and the change point is the random number. We believe the result will be similar. The whole time to generate the TalkSet will use about 3 to 6 hours in our experiments. \ No newline at end of file diff --git a/talknet-asd/TalkSet/generate_TalkSet.py b/talknet-asd/TalkSet/generate_TalkSet.py new file mode 100644 index 0000000000000000000000000000000000000000..e06d9ab74a0cb9b3a089c6440fac8ff62b598b66 --- /dev/null +++ b/talknet-asd/TalkSet/generate_TalkSet.py @@ -0,0 +1,391 @@ +import os, glob, subprocess, argparse, sys, numpy, random, math, cv2 +from itertools import repeat +from multiprocessing import Pool +from scipy.io import wavfile +from pydub import AudioSegment +from tqdm import tqdm + +def get_length(input_video): + result = subprocess.run(['ffprobe', '-v', 'error', '-show_entries', 'format=duration', '-of', 'default=noprint_wrappers=1:nokey=1', input_video], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + return float(result.stdout) + +def read_Vox_lines(file): + Tlines, Flines = [], [] + with open(file) as f_in: + while True: + line = f_in.readline() + if not line: + break + if int(line[0]): + Tlines.append(line) + else: + Flines.append(line) + return Tlines, Flines + +def read_LRS3_ST(file): + lines = [] + with open(file) as f_in: + while True: + line = f_in.readline() + if not line: + break + lines.append(line) + return lines[:30000] + +def read_LRS3_S(file): + lines = [] + with open(file) as f_in: + while True: + line = 
f_in.readline() + if not line: + break + start = int(line.split()[1]) / 100 + end = int(line.split()[2]) / 100 + if end - start <= 3: # Only select less than 3s + lines.append(line) + return lines[:30000] + +def generate_TAudio(line, args): + # Get the id of the audio and video + audio_name = line.split()[1][:-4] + video_name = line.split()[2][:-4] + id1 = audio_name.split('/')[0] + name1 = audio_name.split('/')[0] + '_' + audio_name.split('/')[1] + '_' + audio_name.split('/')[2] + name2 = video_name.split('/')[0] + '_' + video_name.split('/')[1] + '_' + video_name.split('/')[2] + name = name1 + '_' + name2 + audio_path = os.path.join(args.Vox_audio, audio_name + '.wav') + video_path = os.path.join(args.Vox_video, video_name + '.mp4') + out_audio_path = os.path.join(args.out_path, 'TAudio', id1 + '/' + name + '.wav') + out_video_path = os.path.join(args.out_path, 'TAudio', id1 + '/' + name + '.mp4') + os.makedirs(os.path.join(args.out_path, 'TAudio', id1), exist_ok = True) + + # Read the audio data and the length of audio and video + audio = AudioSegment.from_file(audio_path, format="wav") + length_audio = len(audio) / 1000.0 + length_video = get_length(video_path) + length_data = int(min(length_video, length_audio) * 100) / 100 + audio = audio[:int(length_data * 1000)] + + # Extract the video and audio + start = 0 + end = length_data + audio.export(out_audio_path, format="wav") + cmd = "ffmpeg -y -ss %.3f -t %.3f -i %s -i %s -c:v copy -c:a aac -map 0:v:0 -map 1:a:0 -shortest %s -loglevel panic"% (start, end - start, video_path, out_audio_path, out_video_path) + subprocess.call(cmd, shell=True, stdout=None) + + # # Write the txt file + start_T, end_T = 0, length_data + start_F, end_F= 0, 0 + line_new = "TAudio" + ' ' + str(audio_name) + ' ' + str(video_name) + ' ' + str(length_data) \ + + ' ' + str(start_T) + ' ' + str(end_T) + ' ' + str(start_F) + ' ' + str(end_F) + '\n' + return line_new + +def generate_FAudio(line, args): + # Get the id of the audio and video + 
audio_name = line.split()[1][:-4] + video_name = line.split()[2][:-4] + id1 = audio_name.split('/')[0] + name1 = audio_name.split('/')[0] + '_' + audio_name.split('/')[1] + '_' + audio_name.split('/')[2] + name2 = video_name.split('/')[0] + '_' + video_name.split('/')[1] + '_' + video_name.split('/')[2] + name = name1 + '_' + name2 + audio_path = os.path.join(args.Vox_audio, audio_name + '.wav') + video_path = os.path.join(args.Vox_video, video_name + '.mp4') + out_audio_path = os.path.join(args.out_path, 'FAudio', id1 + '/' + name + '.wav') + out_video_path = os.path.join(args.out_path, 'FAudio', id1 + '/' + name + '.mp4') + os.makedirs(os.path.join(args.out_path, 'FAudio', id1), exist_ok = True) + + # Read the audio data and the length of audio and video + audio = AudioSegment.from_file(audio_path, format="wav") + length_audio = len(audio) / 1000.0 + length_video = get_length(video_path) + length_data = int(min(length_video, length_audio) * 100) / 100 + audio = audio[:int(length_data * 1000)] + + # Extract the video and audio + start = 0 + end = length_data + audio.export(out_audio_path, format="wav") + cmd = "ffmpeg -y -ss %.3f -t %.3f -i %s -i %s -c:v copy -c:a aac -map 0:v:0 -map 1:a:0 -shortest %s -loglevel panic"% (start, end - start, video_path, out_audio_path, out_video_path) + subprocess.call(cmd, shell=True, stdout=None) + + # Write the txt file + start_T, end_T = 0, 0 + start_F, end_F= 0, length_data + line_new = "FAudio" + ' ' + str(audio_name) + ' ' + str(video_name) + ' ' + str(length_data) \ + + ' ' + str(start_T) + ' ' + str(end_T) + ' ' + str(start_F) + ' ' + str(end_F) + '\n' + return line_new + +def generate_TFAudio(line, args): + # Get the id of the audio and video + audio_name = line.split()[1][:-4] + video_name = line.split()[2][:-4] + id1 = audio_name.split('/')[0] + name1 = audio_name.split('/')[0] + '_' + audio_name.split('/')[1] + '_' + audio_name.split('/')[2] + name2 = video_name.split('/')[0] + '_' + video_name.split('/')[1] + '_' + 
video_name.split('/')[2] + name = name1 + '_' + name2 + audio_T_path = os.path.join(args.Vox_audio, video_name + '.wav') + audio_F_path = os.path.join(args.Vox_audio, audio_name + '.wav') + video_path = os.path.join(args.Vox_video, video_name + '.mp4') + out_audio_path = os.path.join(args.out_path, 'TFAudio', id1 + '/' + name + '.wav') + out_video_path = os.path.join(args.out_path, 'TFAudio', id1 + '/' + name + '.mp4') + os.makedirs(os.path.join(args.out_path, 'TFAudio', id1), exist_ok = True) + + # Read the audio data and the length of audio and video + audio_T = AudioSegment.from_file(audio_T_path, format="wav") + audio_F = AudioSegment.from_file(audio_F_path, format="wav") + length_audio_T = len(audio_T) / 1000.0 + length_audio_F = len(audio_F) / 1000.0 + length_video = get_length(video_path) + length_data = int(min(length_audio_T, length_audio_F, length_video) * 100) / 100 + audio_T = audio_T[:int(length_data * 1000)] + audio_F = audio_F[:int(length_data * 1000)] + + # Generate the audio + changepoint = int((length_data * 0.25 + length_data * random.random() * 0.5) * 100) / 100 + audio_dict = {} + audio_dict['T1'] = audio_T[:changepoint * 1000] + audio_dict['T2'] = audio_T[changepoint * 1000:] + audio_dict['F1'] = audio_F[:changepoint * 1000] + audio_dict['F2'] = audio_F[changepoint * 1000:] + seed = random.randint(0,1) + if seed == 1: + audio = audio_dict['T1'] + audio_dict['F2'] + else: + audio = audio_dict['F1'] + audio_dict['T2'] + # Extract the video and audio + start = 0 + end = length_data + audio.export(out_audio_path, format="wav") + cmd = "ffmpeg -y -ss %.3f -t %.3f -i %s -i %s -c:v copy -c:a aac -map 0:v:0 -map 1:a:0 -shortest %s -loglevel panic"% (start, end - start, video_path, out_audio_path, out_video_path) + subprocess.call(cmd, shell=True, stdout=None) + + # Write the txt file + if seed == 1: + start_T, end_T, start_F, end_F = 0, changepoint, changepoint, length_data + elif seed == 0: + start_F, end_F, start_T, end_T = 0, changepoint, 
changepoint, length_data + line_new = "TFAudio" + ' ' + str(audio_name) + ' ' + str(video_name) + ' ' + str(length_data) \ + + ' ' + str(start_T) + ' ' + str(end_T) + ' ' + str(start_F) + ' ' + str(end_F) + '\n' + return line_new + +def generate_TSilence(line, args): + # Get the id of the audio and video + type_change = line.split()[0] + audio_name = line.split()[1] + video_name = line.split()[1] + id1 = audio_name.split('/')[0] + name1 = audio_name.split('/')[0] + '_' + audio_name.split('/')[1] + '_' + line.split()[5] + name2 = video_name.split('/')[0] + '_' + video_name.split('/')[1] + '_' + line.split()[5] + name = name1 + '_' + name2 + start = int(line.split()[2]) / 100 + mid = int(line.split()[3]) / 100 + end = int(line.split()[4]) / 100 + audio_path = os.path.join(args.lrs3_audio, 'pretrain', audio_name[8:] + '.wav') + video_path = os.path.join(args.lrs3_video, 'pretrain', video_name[8:]+ '.mp4') + out_audio_path = os.path.join(args.out_path, 'TSilence', id1 + '/' + name + '.wav') + out_video_path = os.path.join(args.out_path, 'TSilence', id1 + '/' + name + '.mp4') + os.makedirs(os.path.join(os.path.join(args.out_path, 'TSilence'), id1), exist_ok = True) + + # Read the audio data and the length of audio and video + audio = AudioSegment.from_file(audio_path, format="wav") + + # Get the required audio and video data + length_data = int((end - start) * 100) / 100 + audio = audio[int(start * 1000):int(end * 1000)] + audio.export(out_audio_path, format="wav") + cmd = "ffmpeg -y -ss %.3f -t %.3f -i %s -i %s -c:v copy -c:a aac -map 0:v:0 -map 1:a:0 -shortest %s -loglevel panic"% (start, end - start, video_path, out_audio_path, out_video_path) + subprocess.call(cmd, shell=True, stdout=None) + + changepoint = int((mid - start) * 100) / 100 + if type_change == "10": + start_T, end_T, start_F, end_F = 0, changepoint, changepoint, length_data + elif type_change == "01": + start_T, end_T, start_F, end_F = changepoint, length_data, 0, changepoint + + audio_name = 
audio_name[:-5] + line.split()[5] + video_name = video_name[:-5] + line.split()[5] + line_new = "TSilence" + ' ' + str(audio_name) + ' ' + str(video_name) + ' ' + str(length_data) \ + + ' ' + str(start_T) + ' ' + str(end_T) + ' ' + str(start_F) + ' ' + str(end_F) + '\n' + return line_new + +def generate_FSilence(line, Flines, args): + # Get the id of the audio and video + audio_T_name = line.split()[0] + video_name = line.split()[0] + start = int(line.split()[1]) / 100 + end = int(line.split()[2]) / 100 + length_data = int((end - start) * 100) / 100 + changepoint = int((length_data * 0.25 + length_data * random.random() * 0.5) * 100) / 100 + speech_line = random.choice(Flines) + length_speech = float(speech_line.split()[-1]) + while length_speech < length_data: + speech_line = random.choice(Flines) + length_speech = float(speech_line.split()[-1]) + audio_F_name = speech_line.split()[1][:-4] + id1 = audio_F_name.split('/')[0] + name1 = audio_F_name.split('/')[0] + '_' + audio_F_name.split('/')[1] + '_' + audio_F_name.split('/')[2] + name2 = audio_T_name.split('/')[0] + '_' + audio_T_name.split('/')[1] + '_' + line.split()[-1] + name = name1 + '_' + name2 + + # True: orig_video False: speech+slience + video_path = os.path.join(args.lrs3_video, 'pretrain', video_name[8:]+ '.mp4') + audio_T_path = os.path.join(args.lrs3_audio, 'pretrain', audio_T_name[8:] + '.wav') + audio_F_path = os.path.join(args.Vox_audio, audio_F_name + '.wav') + out_audio_path = os.path.join(args.out_path, 'FSilence', id1 + '/' + name + '.wav') + out_video_path = os.path.join(args.out_path, 'FSilence', id1 + '/' + name + '.mp4') + os.makedirs(os.path.join(args.out_path, 'FSilence', id1), exist_ok = True) + + # Read the audio data and the length of audio and video + audio_T = AudioSegment.from_file(audio_T_path, format="wav") + audio_T = audio_T[int(start * 1000):int(end * 1000)] + audio_F = AudioSegment.from_file(audio_F_path, format="wav") + length_audio_T = len(audio_T) / 1000.0 + 
length_audio_F = len(audio_F) / 1000.0 + length_video = get_length(video_path) + length_data = int(min(length_audio_T, length_audio_F, length_video) * 100) / 100 + audio_T = audio_T[:int(length_data * 1000)] + audio_F = audio_F[:int(length_data * 1000)] + + # Generate the audio + audio_dict = {} + audio_dict['T1'] = audio_T[:changepoint * 1000] + audio_dict['T2'] = audio_T[changepoint * 1000:] + audio_dict['F1'] = audio_F[:changepoint * 1000] + audio_dict['F2'] = audio_F[changepoint * 1000:] + seed = random.randint(0,1) + if seed == 1: + audio = audio_dict['T1'] + audio_dict['F2'] + else: + audio = audio_dict['F1'] + audio_dict['T2'] + # Extract the video and audio + audio.export(out_audio_path, format="wav") + cmd = "ffmpeg -y -ss %.3f -t %.3f -i %s -i %s -c:v copy -c:a aac -map 0:v:0 -map 1:a:0 -shortest %s -loglevel panic"% (start, end - start, video_path, out_audio_path, out_video_path) + subprocess.call(cmd, shell=True, stdout=None) + + # Write the txt file + if seed == 1: + start_T, end_T, start_F, end_F = 0, changepoint, changepoint, length_data + elif seed == 0: + start_F, end_F, start_T, end_T = 0, changepoint, changepoint, length_data + + video_name = video_name[:-5] + line.split()[-1] + line_new = "FSilence" + ' ' + str(audio_F_name) + ' ' + str(video_name) + ' ' + str(length_data) \ + + ' ' + str(start_T) + ' ' + str(end_T) + ' ' + str(start_F) + ' ' + str(end_F) + '\n' + return line_new + +# MAIN +parser = argparse.ArgumentParser(description = "generate_Dataset") + +parser.add_argument('--List_folder', type=str, default= 'lists') +parser.add_argument('--out_path', type=str, default= '/data07/ruijie/database/TalkSet_final') +parser.add_argument('--Vox_audio', type=str, default= '/home/ruijie/database/VoxCeleb2/audio/audio_clean/clean/train') +parser.add_argument('--Vox_video', type=str, default= '/home/ruijie/database/VoxCeleb2/video/orig/train') +parser.add_argument('--lrs3_audio', type=str, 
default='/data07/ruijie/database/LRS3/audio/orig_audio/clean') +parser.add_argument('--lrs3_video', type=str, default='/data07/ruijie/database/LRS3/video/orig_video') +parser.add_argument('--task', type=str, default='TAudio') +parser.add_argument('--num_cpu', type=int, default=10) +args = parser.parse_args() + +os.makedirs(os.path.join(args.out_path, 'TAudio'), exist_ok = True) +os.makedirs(os.path.join(args.out_path, 'FAudio'), exist_ok = True) +os.makedirs(os.path.join(args.out_path, 'TFAudio'), exist_ok = True) +os.makedirs(os.path.join(args.out_path, 'FSilence'), exist_ok = True) +os.makedirs(os.path.join(args.out_path, 'TSilence'), exist_ok = True) + +args.list_Vox = os.path.join(args.List_folder, 'lists_in', 'Vox_list.txt') +args.list_LRS3_S = os.path.join(args.List_folder, 'lists_in', 'LRS3_S_list.txt') +args.list_LRS3_ST = os.path.join(args.List_folder, 'lists_in', 'LRS3_ST_list.txt') +args.list_out = os.path.join(args.List_folder, 'lists_out') +args.list_out_train = os.path.join(args.list_out, 'train.txt') +args.list_out_test = os.path.join(args.list_out, 'test.txt') + +if args.task == 'TAudio': + Tlines, _ = read_Vox_lines(args.list_Vox) + Tlines_new = [] + # Generate the video and audio + with Pool(args.num_cpu) as p: + Tlines_new.append(p.starmap(generate_TAudio, zip(Tlines, repeat(args)))) + # Write the txt file + out_Tlist_file = open(os.path.join(args.list_out, 'TAudio.txt'), "w") + for line_new in Tlines_new[0]: + out_Tlist_file.write(line_new) + print('TAudio Finish') + +if args.task == 'FAudio': + _, Flines = read_Vox_lines(args.list_Vox) + Flines_new = [] + # Generate the video and audio + with Pool(args.num_cpu) as p: + Flines_new.append(p.starmap(generate_FAudio, zip(Flines, repeat(args)))) + + # Write the txt file + out_Flist_file = open(os.path.join(args.list_out, 'FAudio.txt'), "w") + for line_new in Flines_new[0]: + out_Flist_file.write(line_new) + print('FAudio Finish') + +if args.task == 'TFAudio': + _, Flines = 
read_Vox_lines(args.list_Vox) + TFlines_new = [] + # Generate the video and audio + with Pool(args.num_cpu) as p: + TFlines_new.append(p.starmap(generate_TFAudio, zip(Flines, repeat(args)))) + + # Write the txt file + out_TFlist_file = open(os.path.join(args.list_out, 'TFAudio.txt'), "w") + for line_new in TFlines_new[0]: + out_TFlist_file.write(line_new) + print('TFAudio Finish') + +if args.task == 'TSilence': + Slines = read_LRS3_ST(args.list_LRS3_ST) + TSlines_new = [] + with Pool(args.num_cpu) as p: + TSlines_new.append(p.starmap(generate_TSilence, zip(Slines, repeat(args)))) + + # Write the txt file + out_TSlist_file = open(os.path.join(args.list_out, 'TSilence.txt'), "w") + for line_new in TSlines_new[0]: + out_TSlist_file.write(line_new) + print('TSilence Finish') + +if args.task == 'FSilence': + Tlines, _ = read_Vox_lines(args.list_Vox) + Slines = read_LRS3_S(args.list_LRS3_S) + FSlines_new = [] + with Pool(args.num_cpu) as p: + FSlines_new.append(p.starmap(generate_FSilence, zip(Slines, repeat(Tlines), repeat(args)))) + + out_FSlist_file = open(os.path.join(args.list_out, 'FSilence.txt'), "w") + for line_new in FSlines_new[0]: + out_FSlist_file.write(line_new) + print('FSilence Finish') + +if args.task == 'Fusion': + lines = [] + for name in {'TAudio', 'FAudio', 'TFAudio', 'TSilence', 'FSilence'}: + with open(args.list_out + '/' + name + '.txt') as f: + while True: + line = f.readline() + if not line: + break + lines.append(line) + train_file = open(args.list_out_train, "w") + test_file = open(args.list_out_test, "w") + random.shuffle(lines) + for num, line in enumerate(lines): + data = line.split() + if float(data[3]) > 6: # For the data longer than 6s, we cut them into 6s, so that will make training process simple. 
+ line = str(data[0]) + ' ' + str(data[1]) + ' ' + str(data[2]) + ' ' + \ + str(min(float(data[3]), 6)) + ' ' + str(min(float(data[4]), 6)) + ' ' + \ + str(min(float(data[5]), 6)) + ' ' + str(min(float(data[6]), 6)) + ' ' + \ + str(min(float(data[7]), 6)) + ' ' + "%06d"%int(num) + '\n' + else: + line = str(data[0]) + ' ' + str(data[1]) + ' ' + str(data[2]) + ' ' + \ + str(data[3]) + ' ' + str(data[4]) + ' ' + \ + str(data[5]) + ' ' + str(data[6]) + ' ' + \ + str(data[7]) + ' ' + "%06d"%int(num) + '\n' + if num % 30000 < 27000: + train_file.write(line) + else: + test_file.write(line) \ No newline at end of file diff --git a/talknet-asd/awesomeASD.md b/talknet-asd/awesomeASD.md new file mode 100644 index 0000000000000000000000000000000000000000..8661791e70978f560602916c4756878f51703d08 --- /dev/null +++ b/talknet-asd/awesomeASD.md @@ -0,0 +1,38 @@ +## Related Work for Active Speaker Detection + +--- +### Research Paper In **AVA-ActiveSpeaker Dataset** + +- Roth J, Chaudhuri S, Klejch O, et al. Ava active speaker: [An audio-visual dataset for active speaker detection](https://arxiv.org/pdf/1901.01342.pdf), ICASSP, 2020. +- Sharma R, Somandepalli K, Narayanan S. [Crossmodal learning for audio-visual speech event localization](https://arxiv.org/pdf/2003.04358.pdf), arXiv preprint, 2020. +- Alcázar J L, Caba F, Mai L, et al. [Active speakers in context](https://openaccess.thecvf.com/content_CVPR_2020/papers/Alcazar_Active_Speakers_in_Context_CVPR_2020_paper.pdf) , CVPR, 2020. +- León-Alcázar J, Heilbron F C, Thabet A, et al. [MAAS: Multi-modal Assignation for Active Speaker Detection](https://arxiv.org/pdf/2101.03682.pdf), arXiv preprint, 2021. +- Huang C, Koishida K. [Improved Active Speaker Detection based on Optical Flow](https://openaccess.thecvf.com/content_CVPRW_2020/papers/w56/Huang_Improved_Active_Speaker_Detection_Based_on_Optical_Flow_CVPRW_2020_paper.pdf), CVPR Workshops, 2020 +- Assunção G, Gonçalves N, Menezes P. 
[Bio-Inspired Modality Fusion for Active Speaker Detection](https://www.mdpi.com/2076-3417/11/8/3397/pdf), Applied Sciences, 2021 +- Pouthier B, Pilati L, Gudupudi L K, et al. [Active Speaker Detection as a Multi-Objective Optimization with Uncertainty-based Multimodal Fusion](https://arxiv.org/pdf/2106.03821.pdf), arXiv preprint, 2021 +- Köpüklü O, Taseska M, Rigoll G. [How to Design a Three-Stage Architecture for Audio-Visual Active Speaker Detection in the Wild](https://arxiv.org/pdf/2106.03932.pdf), arVix preprint, 2021 +- Ruijie Tao, Zexu Pan, Rohan Kumar Das, Xinyuan Qian, Mike Zheng Shou, Haizhou Li. [Is Someone Speaking? Exploring Long-term Temporal Features for Audio-visual Active Speaker Detection](https://arxiv.org/pdf/2107.06592.pdf), ACM Multimedia (MM), 2021 +- Yuanhang Zhang, Susan Liang, Shuang Yang, Xiao Liu, Zhongqin Wu, Shiguang Shan, Xilin Chen. [UniCon: Unified Context Network for Robust Active Speaker +Detection](https://arxiv.org/pdf/2108.02607.pdf), ACM Multimedia (MM), 2021 + + +### Research Report In **AVA-ActiveSpeaker Dataset for AVA-Activity Challenge** +- Chung J S. [Naver at ActivityNet Challenge 2019--Task B Active Speaker Detection (AVA)](https://arxiv.org/pdf/1906.10555.pdf), 2019. +- Zhang Y H, Xiao J, Yang S, et al. [Multi-Task Learning for Audio-Visual Active Speaker Detection](https://static.googleusercontent.com/media/research.google.com/zh-CN//ava/2019/Multi_Task_Learning_for_Audio_Visual_Active_Speaker_Detection.pdf), 2019 +- Alcázar J L, Caba F, Mai L, et al. [Universidad de los Andes at ActivityNet Challenge 2020 - Task B Active Speaker +Detection (AVA)](https://static.googleusercontent.com/media/research.google.com/zh-CN//ava/2020/ASC_AN_report.pdf), 2020 +- Köpüklü O, Taseska M, Rigoll G. [ASDNet at ActivityNet Challenge 2021-Active Speaker Detection (AVA)](https://static.googleusercontent.com/media/research.google.com/zh-CN//ava/2021/S2_ActivityNet_Report_ASDNet.pdf), 2021 +- Zhang Y, Liang S, Yang S, et al. 
[ICTCAS-UCAS-TAL Submission to the AVA-ActiveSpeaker Task at ActivityNet Challenge 2021](http://static.googleusercontent.com/media/research.google.com/zh-CN//ava/2021/S1_ICTCAS-UCAS-TAL.pdf), 2021 +- Tao R, Pan Z, Das R K, et al. [NUS-HLT Report for ActivityNet Challenge 2021 AVA (Speaker)](https://static.googleusercontent.com/media/research.google.com/zh-CN//ava/2021/S3_NUS_Report_AVA_ActiveSpeaker_2021.pdf), 2021 + +### Research Paper In **Columnbia Active Speaker Detection Dataset** +- Chakravarty P, Tuytelaars T. [Cross-modal supervision for learning active speaker detection in video](https://arxiv.org/pdf/1603.08907.pdf), ECCV, 2016 +- Chung J S, Zisserman A. [Out of time: automated lip sync in the wild](https://www.robots.ox.ac.uk/~vgg/publications/2016/Chung16a/chung16a.pdf), ECCV, 2016 +- Shahid M, Beyan C, Murino V. [Voice activity detection by upper body motion analysis and unsupervised domain adaptation](https://openaccess.thecvf.com/content_ICCVW_2019/papers/HBU/Shahid_Voice_Activity_Detection_by_Upper_Body_Motion_Analysis_and_Unsupervised_ICCVW_2019_paper.pdf), ICCV Workshops, 2019 +- Afouras T, Owens A, Chung J S, et al. [Self-supervised learning of audio-visual objects from video](https://arxiv.org/pdf/2008.04237.pdf), ECCV, 2020 +- Shahid M, Beyan C, Murino V. [Comparisons of visual activity primitives for voice activity detection](https://www.researchgate.net/profile/Cigdem-Beyan/publication/335604556_Comparisons_of_Visual_Activity_Primitives_for_Voice_Activity_Detection/links/5fa19074a6fdccfd7b97c0f5/Comparisons-of-Visual-Activity-Primitives-for-Voice-Activity-Detection.pdf), ICIAP, 2019 +- Shahid M, Beyan C, Murino V. [S-VVAD: Visual Voice Activity Detection by Motion](https://www.researchgate.net/profile/Cigdem-Beyan/publication/348279893_S-VVAD_Visual_Voice_Activity_Detection_by_Motion_Segmentation/links/5ff60482299bf14088786cc1/S-VVAD-Visual-Voice-Activity-Detection-by-Motion-Segmentation.pdf), WACV, 2021 +- Beyan C, Shahid M, Murino V. 
[RealVAD: A real-world dataset and a method for voice activity detection by body motion analysis](https://ieeexplore.ieee.org/document/9133504), IEEE Transactions on Multimedia, 2020. + +### Other Paper for Active Speaker Detection +- Kim You Jin and Heo Hee-Soo, Soyeon Choe, et al. [Look Who’s Talking: Active Speaker Detection in the Wild](https://arxiv.org/pdf/2108.07640.pdf), Interspeech, 2021 diff --git a/talknet-asd/cog.yaml b/talknet-asd/cog.yaml new file mode 100644 index 0000000000000000000000000000000000000000..33093abd948f3972453564bc7b607fd618c6415c --- /dev/null +++ b/talknet-asd/cog.yaml @@ -0,0 +1,40 @@ +# Configuration for Cog ⚙️ +# Reference: https://github.com/replicate/cog/blob/main/docs/yaml.md + +build: + # set to true if your model requires a GPU + gpu: true + + # a list of ubuntu apt packages to install + system_packages: + - "libgl1-mesa-glx" + - "ffmpeg" + # - "libglib2.0-0" + + # python version in the form '3.11' or '3.11.4' + python_version: "3.8" + + # a list of packages in the format == + python_packages: + - "torch>=1.6.0" + - "torchaudio>=0.6.0" + - "numpy" + - "scipy" + - "scikit-learn" + - "tqdm" + - "scenedetect" + - "opencv-python" + - "python_speech_features" + - "torchvision" + - "ffmpeg" + - "gdown" + - "youtube-dl" + - "pandas" + + # commands run after the environment is setup + # run: + # - "echo env is ready!" 
+ # - "echo another command if needed" + +# predict.py defines how predictions are run on your model +predict: "predict.py:Predictor" diff --git a/talknet-asd/dataLoader.py b/talknet-asd/dataLoader.py new file mode 100644 index 0000000000000000000000000000000000000000..c2bc2e98a3425ff4fc00bedfd67189dc019ec0c7 --- /dev/null +++ b/talknet-asd/dataLoader.py @@ -0,0 +1,143 @@ +import os, torch, numpy, cv2, random, glob, python_speech_features +from scipy.io import wavfile +from torchvision.transforms import RandomCrop + +def generate_audio_set(dataPath, batchList): + audioSet = {} + for line in batchList: + data = line.split('\t') + videoName = data[0][:11] + dataName = data[0] + _, audio = wavfile.read(os.path.join(dataPath, videoName, dataName + '.wav')) + audioSet[dataName] = audio + return audioSet + +def overlap(dataName, audio, audioSet): + noiseName = random.sample(set(list(audioSet.keys())) - {dataName}, 1)[0] + noiseAudio = audioSet[noiseName] + snr = [random.uniform(-5, 5)] + if len(noiseAudio) < len(audio): + shortage = len(audio) - len(noiseAudio) + noiseAudio = numpy.pad(noiseAudio, (0, shortage), 'wrap') + else: + noiseAudio = noiseAudio[:len(audio)] + noiseDB = 10 * numpy.log10(numpy.mean(abs(noiseAudio ** 2)) + 1e-4) + cleanDB = 10 * numpy.log10(numpy.mean(abs(audio ** 2)) + 1e-4) + noiseAudio = numpy.sqrt(10 ** ((cleanDB - noiseDB - snr) / 10)) * noiseAudio + audio = audio + noiseAudio + return audio.astype(numpy.int16) + +def load_audio(data, dataPath, numFrames, audioAug, audioSet = None): + dataName = data[0] + fps = float(data[2]) + audio = audioSet[dataName] + if audioAug == True: + augType = random.randint(0,1) + if augType == 1: + audio = overlap(dataName, audio, audioSet) + else: + audio = audio + # fps is not always 25, in order to align the visual, we modify the window and step in MFCC extraction process based on fps + audio = python_speech_features.mfcc(audio, 16000, numcep = 13, winlen = 0.025 * 25 / fps, winstep = 0.010 * 25 / fps) + 
maxAudio = int(numFrames * 4) + if audio.shape[0] < maxAudio: + shortage = maxAudio - audio.shape[0] + audio = numpy.pad(audio, ((0, shortage), (0,0)), 'wrap') + audio = audio[:int(round(numFrames * 4)),:] + return audio + +def load_visual(data, dataPath, numFrames, visualAug): + dataName = data[0] + videoName = data[0][:11] + faceFolderPath = os.path.join(dataPath, videoName, dataName) + faceFiles = glob.glob("%s/*.jpg"%faceFolderPath) + sortedFaceFiles = sorted(faceFiles, key=lambda data: (float(data.split('/')[-1][:-4])), reverse=False) + faces = [] + H = 112 + if visualAug == True: + new = int(H*random.uniform(0.7, 1)) + x, y = numpy.random.randint(0, H - new), numpy.random.randint(0, H - new) + M = cv2.getRotationMatrix2D((H/2,H/2), random.uniform(-15, 15), 1) + augType = random.choice(['orig', 'flip', 'crop', 'rotate']) + else: + augType = 'orig' + for faceFile in sortedFaceFiles[:numFrames]: + face = cv2.imread(faceFile) + face = cv2.cvtColor(face, cv2.COLOR_BGR2GRAY) + face = cv2.resize(face, (H,H)) + if augType == 'orig': + faces.append(face) + elif augType == 'flip': + faces.append(cv2.flip(face, 1)) + elif augType == 'crop': + faces.append(cv2.resize(face[y:y+new, x:x+new] , (H,H))) + elif augType == 'rotate': + faces.append(cv2.warpAffine(face, M, (H,H))) + faces = numpy.array(faces) + return faces + + +def load_label(data, numFrames): + res = [] + labels = data[3].replace('[', '').replace(']', '') + labels = labels.split(',') + for label in labels: + res.append(int(label)) + res = numpy.array(res[:numFrames]) + return res + +class train_loader(object): + def __init__(self, trialFileName, audioPath, visualPath, batchSize, **kwargs): + self.audioPath = audioPath + self.visualPath = visualPath + self.miniBatch = [] + mixLst = open(trialFileName).read().splitlines() + # sort the training set by the length of the videos, shuffle them to make more videos in the same batch belong to different movies + sortedMixLst = sorted(mixLst, key=lambda data: 
(int(data.split('\t')[1]), int(data.split('\t')[-1])), reverse=True) + start = 0 + while True: + length = int(sortedMixLst[start].split('\t')[1]) + end = min(len(sortedMixLst), start + max(int(batchSize / length), 1)) + self.miniBatch.append(sortedMixLst[start:end]) + if end == len(sortedMixLst): + break + start = end + + def __getitem__(self, index): + batchList = self.miniBatch[index] + numFrames = int(batchList[-1].split('\t')[1]) + audioFeatures, visualFeatures, labels = [], [], [] + audioSet = generate_audio_set(self.audioPath, batchList) # load the audios in this batch to do augmentation + for line in batchList: + data = line.split('\t') + audioFeatures.append(load_audio(data, self.audioPath, numFrames, audioAug = True, audioSet = audioSet)) + visualFeatures.append(load_visual(data, self.visualPath,numFrames, visualAug = True)) + labels.append(load_label(data, numFrames)) + return torch.FloatTensor(numpy.array(audioFeatures)), \ + torch.FloatTensor(numpy.array(visualFeatures)), \ + torch.LongTensor(numpy.array(labels)) + + def __len__(self): + return len(self.miniBatch) + + +class val_loader(object): + def __init__(self, trialFileName, audioPath, visualPath, **kwargs): + self.audioPath = audioPath + self.visualPath = visualPath + self.miniBatch = open(trialFileName).read().splitlines() + + def __getitem__(self, index): + line = [self.miniBatch[index]] + numFrames = int(line[0].split('\t')[1]) + audioSet = generate_audio_set(self.audioPath, line) + data = line[0].split('\t') + audioFeatures = [load_audio(data, self.audioPath, numFrames, audioAug = False, audioSet = audioSet)] + visualFeatures = [load_visual(data, self.visualPath,numFrames, visualAug = False)] + labels = [load_label(data, numFrames)] + return torch.FloatTensor(numpy.array(audioFeatures)), \ + torch.FloatTensor(numpy.array(visualFeatures)), \ + torch.LongTensor(numpy.array(labels)) + + def __len__(self): + return len(self.miniBatch) diff --git a/talknet-asd/demoTalkNet.py 
b/talknet-asd/demoTalkNet.py new file mode 100644 index 0000000000000000000000000000000000000000..0660169fe869be1445359f51fdae5d1c8a3ca1c0 --- /dev/null +++ b/talknet-asd/demoTalkNet.py @@ -0,0 +1,686 @@ +import sys, time, os, tqdm, torch, argparse, glob, subprocess, warnings, cv2, pickle, numpy, pdb, math, python_speech_features + +from scipy import signal +from shutil import rmtree +from scipy.io import wavfile +from scipy.interpolate import interp1d +from sklearn.metrics import accuracy_score, f1_score + +from scenedetect.video_manager import VideoManager +from scenedetect.scene_manager import SceneManager +from scenedetect.frame_timecode import FrameTimecode +from scenedetect.stats_manager import StatsManager +from scenedetect.detectors import ContentDetector + +from model.faceDetector.s3fd import S3FD +from talkNet import talkNet + +warnings.filterwarnings("ignore") + +parser = argparse.ArgumentParser(description="TalkNet Demo or Columnbia ASD Evaluation") + +parser.add_argument("--videoName", type=str, default="001", help="Demo video name") +parser.add_argument( + "--videoFolder", type=str, default="demo", help="Path for inputs, tmps and outputs" +) +parser.add_argument( + "--pretrainModel", + type=str, + default="pretrain_TalkSet.model", + help="Path for the pretrained TalkNet model", +) + +parser.add_argument( + "--nDataLoaderThread", type=int, default=10, help="Number of workers" +) +parser.add_argument( + "--facedetScale", + type=float, + default=0.25, + help="Scale factor for face detection, the frames will be scale to 0.25 orig", +) +parser.add_argument( + "--minTrack", type=int, default=10, help="Number of min frames for each shot" +) +parser.add_argument( + "--numFailedDet", + type=int, + default=10, + help="Number of missed detections allowed before tracking is stopped", +) +parser.add_argument( + "--minFaceSize", type=int, default=1, help="Minimum face size in pixels" +) +parser.add_argument("--cropScale", type=float, default=0.40, help="Scale 
bounding box") + +parser.add_argument("--start", type=int, default=0, help="The start time of the video") +parser.add_argument( + "--duration", + type=int, + default=0, + help="The duration of the video, when set as 0, will extract the whole video", +) + +parser.add_argument( + "--evalCol", + dest="evalCol", + action="store_true", + help="Evaluate on Columnbia dataset", +) +parser.add_argument( + "--colSavePath", + type=str, + default="/data08/col", + help="Path for inputs, tmps and outputs", +) + +args = parser.parse_args() + +if os.path.isfile(args.pretrainModel) == False: # Download the pretrained model + Link = "1AbN9fCf9IexMxEKXLQY2KYBlb-IhSEea" + cmd = "gdown --id %s -O %s" % (Link, args.pretrainModel) + subprocess.call(cmd, shell=True, stdout=None) + +if args.evalCol == True: + # The process is: 1. download video and labels(I have modified the format of labels to make it easiler for using) + # 2. extract audio, extract video frames + # 3. scend detection, face detection and face tracking + # 4. active speaker detection for the detected face clips + # 5. use iou to find the identity of each face clips, compute the F1 results + # The step 1 to 3 will take some time (That is one-time process). It depends on your cpu and gpu speed. 
For reference, I used 1.5 hour + # The step 4 and 5 need less than 10 minutes + # Need about 20G space finally + # ``` + args.videoName = "col" + args.videoFolder = args.colSavePath + args.savePath = os.path.join(args.videoFolder, args.videoName) + args.videoPath = os.path.join(args.videoFolder, args.videoName + ".mp4") + args.duration = 0 + if os.path.isfile(args.videoPath) == False: # Download video + link = "https://www.youtube.com/watch?v=6GzxbrO0DHM&t=2s" + cmd = "youtube-dl -f best -o %s '%s'" % (args.videoPath, link) + output = subprocess.call(cmd, shell=True, stdout=None) + if os.path.isdir(args.videoFolder + "/col_labels") == False: # Download label + link = "1Tto5JBt6NsEOLFRWzyZEeV6kCCddc6wv" + cmd = "gdown --id %s -O %s" % (link, args.videoFolder + "/col_labels.tar.gz") + subprocess.call(cmd, shell=True, stdout=None) + cmd = "tar -xzvf %s -C %s" % ( + args.videoFolder + "/col_labels.tar.gz", + args.videoFolder, + ) + subprocess.call(cmd, shell=True, stdout=None) + os.remove(args.videoFolder + "/col_labels.tar.gz") +else: + args.videoPath = glob.glob(os.path.join(args.videoFolder, args.videoName + ".*"))[0] + args.savePath = os.path.join(args.videoFolder, args.videoName) + + +def scene_detect(args): + # CPU: Scene detection, output is the list of each shot's time duration + videoManager = VideoManager([args.videoFilePath]) + statsManager = StatsManager() + sceneManager = SceneManager(statsManager) + sceneManager.add_detector(ContentDetector()) + baseTimecode = videoManager.get_base_timecode() + videoManager.set_downscale_factor() + videoManager.start() + sceneManager.detect_scenes(frame_source=videoManager) + sceneList = sceneManager.get_scene_list(baseTimecode) + savePath = os.path.join(args.pyworkPath, "scene.pckl") + if sceneList == []: + sceneList = [ + (videoManager.get_base_timecode(), videoManager.get_current_timecode()) + ] + with open(savePath, "wb") as fil: + pickle.dump(sceneList, fil) + sys.stderr.write( + "%s - scenes detected %d\n" % 
(args.videoFilePath, len(sceneList)) + ) + return sceneList + + +def inference_video(args): + # GPU: Face detection, output is the list contains the face location and score in this frame + DET = S3FD(device="cuda") + flist = glob.glob(os.path.join(args.pyframesPath, "*.jpg")) + flist.sort() + dets = [] + for fidx, fname in enumerate(flist): + image = cv2.imread(fname) + imageNumpy = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + bboxes = DET.detect_faces(imageNumpy, conf_th=0.9, scales=[args.facedetScale]) + dets.append([]) + for bbox in bboxes: + dets[-1].append( + {"frame": fidx, "bbox": (bbox[:-1]).tolist(), "conf": bbox[-1]} + ) # dets has the frames info, bbox info, conf info + sys.stderr.write( + "%s-%05d; %d dets\r" % (args.videoFilePath, fidx, len(dets[-1])) + ) + savePath = os.path.join(args.pyworkPath, "faces.pckl") + with open(savePath, "wb") as fil: + pickle.dump(dets, fil) + return dets + + +def bb_intersection_over_union(boxA, boxB, evalCol=False): + # CPU: IOU Function to calculate overlap between two image + xA = max(boxA[0], boxB[0]) + yA = max(boxA[1], boxB[1]) + xB = min(boxA[2], boxB[2]) + yB = min(boxA[3], boxB[3]) + interArea = max(0, xB - xA) * max(0, yB - yA) + boxAArea = (boxA[2] - boxA[0]) * (boxA[3] - boxA[1]) + boxBArea = (boxB[2] - boxB[0]) * (boxB[3] - boxB[1]) + if evalCol == True: + iou = interArea / float(boxAArea) + else: + iou = interArea / float(boxAArea + boxBArea - interArea) + return iou + + +def track_shot(args, sceneFaces): + # CPU: Face tracking + iouThres = 0.5 # Minimum IOU between consecutive face detections + tracks = [] + while True: + track = [] + for frameFaces in sceneFaces: + for face in frameFaces: + if track == []: + track.append(face) + frameFaces.remove(face) + elif face["frame"] - track[-1]["frame"] <= args.numFailedDet: + iou = bb_intersection_over_union(face["bbox"], track[-1]["bbox"]) + if iou > iouThres: + track.append(face) + frameFaces.remove(face) + continue + else: + break + if track == []: + break + elif 
len(track) > args.minTrack: + frameNum = numpy.array([f["frame"] for f in track]) + bboxes = numpy.array([numpy.array(f["bbox"]) for f in track]) + frameI = numpy.arange(frameNum[0], frameNum[-1] + 1) + bboxesI = [] + for ij in range(0, 4): + interpfn = interp1d(frameNum, bboxes[:, ij]) + bboxesI.append(interpfn(frameI)) + bboxesI = numpy.stack(bboxesI, axis=1) + if ( + max( + numpy.mean(bboxesI[:, 2] - bboxesI[:, 0]), + numpy.mean(bboxesI[:, 3] - bboxesI[:, 1]), + ) + > args.minFaceSize + ): + tracks.append({"frame": frameI, "bbox": bboxesI}) + return tracks + + +def crop_video(args, track, cropFile): + # CPU: crop the face clips + flist = glob.glob(os.path.join(args.pyframesPath, "*.jpg")) # Read the frames + flist.sort() + vOut = cv2.VideoWriter( + cropFile + "t.avi", cv2.VideoWriter_fourcc(*"XVID"), 25, (224, 224) + ) # Write video + dets = {"x": [], "y": [], "s": []} + for det in track["bbox"]: # Read the tracks + dets["s"].append(max((det[3] - det[1]), (det[2] - det[0])) / 2) + dets["y"].append((det[1] + det[3]) / 2) # crop center x + dets["x"].append((det[0] + det[2]) / 2) # crop center y + dets["s"] = signal.medfilt(dets["s"], kernel_size=13) # Smooth detections + dets["x"] = signal.medfilt(dets["x"], kernel_size=13) + dets["y"] = signal.medfilt(dets["y"], kernel_size=13) + for fidx, frame in enumerate(track["frame"]): + cs = args.cropScale + bs = dets["s"][fidx] # Detection box size + bsi = int(bs * (1 + 2 * cs)) # Pad videos by this amount + image = cv2.imread(flist[frame]) + frame = numpy.pad( + image, + ((bsi, bsi), (bsi, bsi), (0, 0)), + "constant", + constant_values=(110, 110), + ) + my = dets["y"][fidx] + bsi # BBox center Y + mx = dets["x"][fidx] + bsi # BBox center X + face = frame[ + int(my - bs) : int(my + bs * (1 + 2 * cs)), + int(mx - bs * (1 + cs)) : int(mx + bs * (1 + cs)), + ] + vOut.write(cv2.resize(face, (224, 224))) + audioTmp = cropFile + ".wav" + audioStart = (track["frame"][0]) / 25 + audioEnd = (track["frame"][-1] + 1) / 25 + 
vOut.release() + command = ( + "ffmpeg -y -i %s -async 1 -ac 1 -vn -acodec pcm_s16le -ar 16000 -threads %d -ss %.3f -to %.3f %s -loglevel panic" + % (args.audioFilePath, args.nDataLoaderThread, audioStart, audioEnd, audioTmp) + ) + output = subprocess.call(command, shell=True, stdout=None) # Crop audio file + _, audio = wavfile.read(audioTmp) + command = ( + "ffmpeg -y -i %st.avi -i %s -threads %d -c:v copy -c:a copy %s.avi -loglevel panic" + % (cropFile, audioTmp, args.nDataLoaderThread, cropFile) + ) # Combine audio and video file + output = subprocess.call(command, shell=True, stdout=None) + os.remove(cropFile + "t.avi") + return {"track": track, "proc_track": dets} + + +def extract_MFCC(file, outPath): + # CPU: extract mfcc + sr, audio = wavfile.read(file) + mfcc = python_speech_features.mfcc(audio, sr) # (N_frames, 13) [1s = 100 frames] + featuresPath = os.path.join(outPath, file.split("/")[-1].replace(".wav", ".npy")) + numpy.save(featuresPath, mfcc) + + +def evaluate_network(files, args): + # GPU: active speaker detection by pretrained TalkNet + s = talkNet() + s.loadParameters(args.pretrainModel) + sys.stderr.write("Model %s loaded from previous state! 
\r\n" % args.pretrainModel) + s.eval() + allScores = [] + # durationSet = {1,2,4,6} # To make the result more reliable + durationSet = { + 1, + 1, + 1, + 2, + 2, + 2, + 3, + 3, + 4, + 5, + 6, + } # Use this line can get more reliable result + for file in tqdm.tqdm(files, total=len(files)): + fileName = os.path.splitext(file.split("/")[-1])[0] # Load audio and video + _, audio = wavfile.read(os.path.join(args.pycropPath, fileName + ".wav")) + audioFeature = python_speech_features.mfcc( + audio, 16000, numcep=13, winlen=0.025, winstep=0.010 + ) + video = cv2.VideoCapture(os.path.join(args.pycropPath, fileName + ".avi")) + videoFeature = [] + while video.isOpened(): + ret, frames = video.read() + if ret == True: + face = cv2.cvtColor(frames, cv2.COLOR_BGR2GRAY) + face = cv2.resize(face, (224, 224)) + face = face[ + int(112 - (112 / 2)) : int(112 + (112 / 2)), + int(112 - (112 / 2)) : int(112 + (112 / 2)), + ] + videoFeature.append(face) + else: + break + video.release() + videoFeature = numpy.array(videoFeature) + length = min( + (audioFeature.shape[0] - audioFeature.shape[0] % 4) / 100, + videoFeature.shape[0] / 25, + ) + audioFeature = audioFeature[: int(round(length * 100)), :] + videoFeature = videoFeature[: int(round(length * 25)), :, :] + allScore = [] # Evaluation use TalkNet + for duration in durationSet: + batchSize = int(math.ceil(length / duration)) + scores = [] + with torch.no_grad(): + for i in range(batchSize): + inputA = ( + torch.FloatTensor( + audioFeature[ + i * duration * 100 : (i + 1) * duration * 100, : + ] + ) + .unsqueeze(0) + .cuda() + ) + inputV = ( + torch.FloatTensor( + videoFeature[ + i * duration * 25 : (i + 1) * duration * 25, :, : + ] + ) + .unsqueeze(0) + .cuda() + ) + embedA = s.model.forward_audio_frontend(inputA) + embedV = s.model.forward_visual_frontend(inputV) + embedA, embedV = s.model.forward_cross_attention(embedA, embedV) + out = s.model.forward_audio_visual_backend(embedA, embedV) + score = s.lossAV.forward(out, labels=None) 
+ scores.extend(score) + allScore.append(scores) + allScore = numpy.round((numpy.mean(numpy.array(allScore), axis=0)), 1).astype( + float + ) + allScores.append(allScore) + return allScores + + +def visualization(tracks, scores, args): + # CPU: visulize the result for video format + flist = glob.glob(os.path.join(args.pyframesPath, "*.jpg")) + flist.sort() + faces = [[] for i in range(len(flist))] + for tidx, track in enumerate(tracks): + score = scores[tidx] + for fidx, frame in enumerate(track["track"]["frame"].tolist()): + s = score[ + max(fidx - 2, 0) : min(fidx + 3, len(score) - 1) + ] # average smoothing + s = numpy.mean(s) + faces[frame].append( + { + "track": tidx, + "score": float(s), + "s": track["proc_track"]["s"][fidx], + "x": track["proc_track"]["x"][fidx], + "y": track["proc_track"]["y"][fidx], + } + ) + firstImage = cv2.imread(flist[0]) + fw = firstImage.shape[1] + fh = firstImage.shape[0] + vOut = cv2.VideoWriter( + os.path.join(args.pyaviPath, "video_only.avi"), + cv2.VideoWriter_fourcc(*"XVID"), + 25, + (fw, fh), + ) + colorDict = {0: 0, 1: 255} + for fidx, fname in tqdm.tqdm(enumerate(flist), total=len(flist)): + image = cv2.imread(fname) + for face in faces[fidx]: + clr = colorDict[int((face["score"] >= 0))] + txt = round(face["score"], 1) + cv2.rectangle( + image, + (int(face["x"] - face["s"]), int(face["y"] - face["s"])), + (int(face["x"] + face["s"]), int(face["y"] + face["s"])), + (0, clr, 255 - clr), + 10, + ) + cv2.putText( + image, + "%s" % (txt), + (int(face["x"] - face["s"]), int(face["y"] - face["s"])), + cv2.FONT_HERSHEY_SIMPLEX, + 1.5, + (0, clr, 255 - clr), + 5, + ) + vOut.write(image) + vOut.release() + command = ( + "ffmpeg -y -i %s -i %s -threads %d -c:v copy -c:a copy %s -loglevel panic" + % ( + os.path.join(args.pyaviPath, "video_only.avi"), + os.path.join(args.pyaviPath, "audio.wav"), + args.nDataLoaderThread, + os.path.join(args.pyaviPath, "video_out.avi"), + ) + ) + output = subprocess.call(command, shell=True, stdout=None) 
+ + +def evaluate_col_ASD(tracks, scores, args): + txtPath = args.videoFolder + "/col_labels/fusion/*.txt" # Load labels + predictionSet = {} + for name in {"long", "bell", "boll", "lieb", "sick", "abbas"}: + predictionSet[name] = [[], []] + dictGT = {} + txtFiles = glob.glob("%s" % txtPath) + for file in txtFiles: + lines = open(file).read().splitlines() + idName = file.split("/")[-1][:-4] + for line in lines: + data = line.split("\t") + frame = int(int(data[0]) / 29.97 * 25) + x1 = int(data[1]) + y1 = int(data[2]) + x2 = int(data[1]) + int(data[3]) + y2 = int(data[2]) + int(data[3]) + gt = int(data[4]) + if frame in dictGT: + dictGT[frame].append([x1, y1, x2, y2, gt, idName]) + else: + dictGT[frame] = [[x1, y1, x2, y2, gt, idName]] + flist = glob.glob(os.path.join(args.pyframesPath, "*.jpg")) # Load files + flist.sort() + faces = [[] for i in range(len(flist))] + for tidx, track in enumerate(tracks): + score = scores[tidx] + for fidx, frame in enumerate(track["track"]["frame"].tolist()): + s = numpy.mean( + score[max(fidx - 2, 0) : min(fidx + 3, len(score) - 1)] + ) # average smoothing + faces[frame].append( + { + "track": tidx, + "score": float(s), + "s": track["proc_track"]["s"][fidx], + "x": track["proc_track"]["x"][fidx], + "y": track["proc_track"]["y"][fidx], + } + ) + for fidx, fname in tqdm.tqdm(enumerate(flist), total=len(flist)): + if fidx in dictGT: # This frame has label + for gtThisFrame in dictGT[fidx]: # What this label is ? 
+ faceGT = gtThisFrame[0:4] + labelGT = gtThisFrame[4] + idGT = gtThisFrame[5] + ious = [] + for face in faces[fidx]: # Find the right face in my result + faceLocation = [ + int(face["x"] - face["s"]), + int(face["y"] - face["s"]), + int(face["x"] + face["s"]), + int(face["y"] + face["s"]), + ] + faceLocation_new = [ + int(face["x"] - face["s"]) // 2, + int(face["y"] - face["s"]) // 2, + int(face["x"] + face["s"]) // 2, + int(face["y"] + face["s"]) // 2, + ] + iou = bb_intersection_over_union( + faceLocation_new, faceGT, evalCol=True + ) + if iou > 0.5: + ious.append([iou, round(face["score"], 2)]) + if len(ious) > 0: # Find my result + ious.sort() + labelPredict = ious[-1][1] + else: + labelPredict = 0 + x1 = faceGT[0] + y1 = faceGT[1] + width = faceGT[2] - faceGT[0] + predictionSet[idGT][0].append(labelPredict) + predictionSet[idGT][1].append(labelGT) + names = ["long", "bell", "boll", "lieb", "sick", "abbas"] # Evaluate + names.sort() + F1s = 0 + for i in names: + scores = numpy.array(predictionSet[i][0]) + labels = numpy.array(predictionSet[i][1]) + scores = numpy.int64(scores > 0) + F1 = f1_score(labels, scores) + ACC = accuracy_score(labels, scores) + if i != "abbas": + F1s += F1 + print("%s, ACC:%.2f, F1:%.2f" % (i, 100 * ACC, 100 * F1)) + print("Average F1:%.2f" % (100 * (F1s / 5))) + + +# Main function +def main(): + # This preprocesstion is modified based on this [repository](https://github.com/joonson/syncnet_python). + # ``` + # . + # ├── pyavi + # │   ├── audio.wav (Audio from input video) + # │   ├── video.avi (Copy of the input video) + # │   ├── video_only.avi (Output video without audio) + # │   └── video_out.avi (Output video with audio) + # ├── pycrop (The detected face videos and audios) + # │ ├── 000000.avi + # │ ├── 000000.wav + # │ ├── 000001.avi + # │ ├── 000001.wav + # │ └── ... + # ├── pyframes (All the video frames in this video) + # │ ├── 000001.jpg + # │ ├── 000002.jpg + # │ └── ... 
+ # └── pywork + # ├── faces.pckl (face detection result) + # ├── scene.pckl (scene detection result) + # ├── scores.pckl (ASD result) + # └── tracks.pckl (face tracking result) + # ``` + + # Initialization + args.pyaviPath = os.path.join(args.savePath, "pyavi") + args.pyframesPath = os.path.join(args.savePath, "pyframes") + args.pyworkPath = os.path.join(args.savePath, "pywork") + args.pycropPath = os.path.join(args.savePath, "pycrop") + if os.path.exists(args.savePath): + rmtree(args.savePath) + os.makedirs( + args.pyaviPath, exist_ok=True + ) # The path for the input video, input audio, output video + os.makedirs(args.pyframesPath, exist_ok=True) # Save all the video frames + os.makedirs( + args.pyworkPath, exist_ok=True + ) # Save the results in this process by the pckl method + os.makedirs( + args.pycropPath, exist_ok=True + ) # Save the detected face clips (audio+video) in this process + + # Extract video + args.videoFilePath = os.path.join(args.pyaviPath, "video.avi") + # If duration did not set, extract the whole video, otherwise extract the video from 'args.start' to 'args.start + args.duration' + if args.duration == 0: + command = ( + "ffmpeg -y -i %s -qscale:v 2 -threads %d -async 1 -r 25 %s -loglevel panic" + % (args.videoPath, args.nDataLoaderThread, args.videoFilePath) + ) + else: + command = ( + "ffmpeg -y -i %s -qscale:v 2 -threads %d -ss %.3f -to %.3f -async 1 -r 25 %s -loglevel panic" + % ( + args.videoPath, + args.nDataLoaderThread, + args.start, + args.start + args.duration, + args.videoFilePath, + ) + ) + subprocess.call(command, shell=True, stdout=None) + sys.stderr.write( + time.strftime("%Y-%m-%d %H:%M:%S") + + " Extract the video and save in %s \r\n" % (args.videoFilePath) + ) + + # Extract audio + args.audioFilePath = os.path.join(args.pyaviPath, "audio.wav") + command = ( + "ffmpeg -y -i %s -qscale:a 0 -ac 1 -vn -threads %d -ar 16000 %s -loglevel panic" + % (args.videoFilePath, args.nDataLoaderThread, args.audioFilePath) + ) + 
subprocess.call(command, shell=True, stdout=None) + sys.stderr.write( + time.strftime("%Y-%m-%d %H:%M:%S") + + " Extract the audio and save in %s \r\n" % (args.audioFilePath) + ) + + # Extract the video frames + command = "ffmpeg -y -i %s -qscale:v 2 -threads %d -f image2 %s -loglevel panic" % ( + args.videoFilePath, + args.nDataLoaderThread, + os.path.join(args.pyframesPath, "%06d.jpg"), + ) + subprocess.call(command, shell=True, stdout=None) + sys.stderr.write( + time.strftime("%Y-%m-%d %H:%M:%S") + + " Extract the frames and save in %s \r\n" % (args.pyframesPath) + ) + + # Scene detection for the video frames + scene = scene_detect(args) + sys.stderr.write( + time.strftime("%Y-%m-%d %H:%M:%S") + + " Scene detection and save in %s \r\n" % (args.pyworkPath) + ) + + # Face detection for the video frames + faces = inference_video(args) + sys.stderr.write( + time.strftime("%Y-%m-%d %H:%M:%S") + + " Face detection and save in %s \r\n" % (args.pyworkPath) + ) + + # Face tracking + allTracks, vidTracks = [], [] + for shot in scene: + if ( + shot[1].frame_num - shot[0].frame_num >= args.minTrack + ): # Discard the shot frames less than minTrack frames + allTracks.extend( + track_shot(args, faces[shot[0].frame_num : shot[1].frame_num]) + ) # 'frames' to present this tracks' timestep, 'bbox' presents the location of the faces + sys.stderr.write( + time.strftime("%Y-%m-%d %H:%M:%S") + + " Face track and detected %d tracks \r\n" % len(allTracks) + ) + + # Face clips cropping + for ii, track in tqdm.tqdm(enumerate(allTracks), total=len(allTracks)): + vidTracks.append( + crop_video(args, track, os.path.join(args.pycropPath, "%05d" % ii)) + ) + savePath = os.path.join(args.pyworkPath, "tracks.pckl") + with open(savePath, "wb") as fil: + pickle.dump(vidTracks, fil) + sys.stderr.write( + time.strftime("%Y-%m-%d %H:%M:%S") + + " Face Crop and saved in %s tracks \r\n" % args.pycropPath + ) + fil = open(savePath, "rb") + vidTracks = pickle.load(fil) + + # Active Speaker Detection by 
TalkNet + files = glob.glob("%s/*.avi" % args.pycropPath) + files.sort() + scores = evaluate_network(files, args) + savePath = os.path.join(args.pyworkPath, "scores.pckl") + with open(savePath, "wb") as fil: + pickle.dump(scores, fil) + sys.stderr.write( + time.strftime("%Y-%m-%d %H:%M:%S") + + " Scores extracted and saved in %s \r\n" % args.pyworkPath + ) + + if args.evalCol == True: + evaluate_col_ASD( + vidTracks, scores, args + ) # The columnbia video is too big for visualization. You can still add the `visualization` funcition here if you want + quit() + else: + # Visualization, save the result as the new video + visualization(vidTracks, scores, args) + + +if __name__ == "__main__": + main() diff --git a/talknet-asd/export_onnx_cpu.py b/talknet-asd/export_onnx_cpu.py new file mode 100644 index 0000000000000000000000000000000000000000..55cc58b7a8fd505b0ac75c06f2241fdce0ad51e2 --- /dev/null +++ b/talknet-asd/export_onnx_cpu.py @@ -0,0 +1,87 @@ +import os + +import torch + +from loss import lossAV +from model.talkNetModel import talkNetModel + + +class TalkNetCPU(torch.nn.Module): + """CPU-only wrapper for TalkNet export.""" + + def __init__(self, ckpt_path: str): + super().__init__() + self.model = talkNetModel() + self.lossAV = lossAV() + self.ckpt_path = ckpt_path + + def load_parameters(self) -> None: + """Load state_dict saved by talkNet.saveParameters (handles module. 
prefix).""" + self_state = self.state_dict() + loaded_state = torch.load(self.ckpt_path, map_location="cpu") + + for name, param in loaded_state.items(): + orig_name = name + target_name = name + if target_name not in self_state: + target_name = target_name.replace("module.", "") + if target_name not in self_state: + print(f"{orig_name} is not in the model.") + continue + if self_state[target_name].shape != loaded_state[orig_name].shape: + print( + f"Shape mismatch {orig_name}: " + f"model {self_state[target_name].shape}, " + f"loaded {loaded_state[orig_name].shape}" + ) + continue + self_state[target_name].copy_(param) + + def forward(self, audio_mfcc: torch.Tensor, video_gray: torch.Tensor) -> torch.Tensor: + """ + audio_mfcc: (B, Ta, 13) + video_gray: (B, Tv, 224, 224) + returns logits: (B*, 2) + """ + audio_embed = self.model.forward_audio_frontend(audio_mfcc) + visual_embed = self.model.forward_visual_frontend(video_gray) + audio_embed, visual_embed = self.model.forward_cross_attention( + audio_embed, visual_embed + ) + av_embed = self.model.forward_audio_visual_backend(audio_embed, visual_embed) + logits = self.lossAV.FC(av_embed) + return logits + + +def main() -> None: + ckpt_path = os.environ.get("CKPT_PATH", "model/pretrain_TalkSet.model") + out_path = os.environ.get("OUT_PATH", "talknet_asd_cpu.onnx") + + model = TalkNetCPU(ckpt_path) + model.load_parameters() + model.eval() + + # Dummy inputs only to build the graph; real lengths are dynamic via dynamic_axes. 
+ dummy_audio = torch.randn(1, 100, 13) # ~1s MFCC (100 frames) + # Model expects 112x112 (demoTalkNet crops 224->center 112) + dummy_video = torch.randn(1, 25, 112, 112) # 25 frames of 112x112 gray crops + + torch.onnx.export( + model, + (dummy_audio, dummy_video), + out_path, + input_names=["audio_mfcc", "video_gray"], + output_names=["logits"], + dynamic_axes={ + "audio_mfcc": {0: "batch", 1: "time_audio"}, + "video_gray": {0: "batch", 1: "time_video"}, + "logits": {0: "time_any"}, + }, + opset_version=14, + ) + print(f"Saved ONNX to {out_path}") + + +if __name__ == "__main__": + main() + diff --git a/talknet-asd/loss.py b/talknet-asd/loss.py new file mode 100644 index 0000000000000000000000000000000000000000..1a08c5bc1215272f86c2751f113070ae2769f30b --- /dev/null +++ b/talknet-asd/loss.py @@ -0,0 +1,50 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class lossAV(nn.Module): + def __init__(self): + super(lossAV, self).__init__() + self.criterion = nn.CrossEntropyLoss() + self.FC = nn.Linear(256, 2) + + def forward(self, x, labels=None): + x = x.squeeze(1) + x = self.FC(x) + if labels == None: + predScore = x[:,1] + predScore = predScore.t() + predScore = predScore.view(-1).detach().cpu().numpy() + return predScore + else: + nloss = self.criterion(x, labels) + predScore = F.softmax(x, dim = -1) + predLabel = torch.round(F.softmax(x, dim = -1))[:,1] + correctNum = (predLabel == labels).sum().float() + return nloss, predScore, predLabel, correctNum + +class lossA(nn.Module): + def __init__(self): + super(lossA, self).__init__() + self.criterion = nn.CrossEntropyLoss() + self.FC = nn.Linear(128, 2) + + def forward(self, x, labels): + x = x.squeeze(1) + x = self.FC(x) + nloss = self.criterion(x, labels) + return nloss + +class lossV(nn.Module): + def __init__(self): + super(lossV, self).__init__() + + self.criterion = nn.CrossEntropyLoss() + self.FC = nn.Linear(128, 2) + + def forward(self, x, labels): + x = x.squeeze(1) + x = self.FC(x) + 
nloss = self.criterion(x, labels) + return nloss + diff --git a/talknet-asd/model/attentionLayer.py b/talknet-asd/model/attentionLayer.py new file mode 100644 index 0000000000000000000000000000000000000000..257c8bf839f2715d299eae1dc7963418de7c01d1 --- /dev/null +++ b/talknet-asd/model/attentionLayer.py @@ -0,0 +1,36 @@ +import torch +import torch.nn as nn +from torch.nn import functional as F +from torch.nn import MultiheadAttention + +class attentionLayer(nn.Module): + + def __init__(self, d_model, nhead, dropout=0.1): + super(attentionLayer, self).__init__() + self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout) + + self.linear1 = nn.Linear(d_model, d_model * 4) + self.dropout = nn.Dropout(dropout) + self.linear2 = nn.Linear(d_model * 4, d_model) + + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + + self.activation = F.relu + + def forward(self, src, tar): + # type: (Tensor, Optional[Tensor], Optional[Tensor]) -> Tensor + src = src.transpose(0, 1) # B, T, C -> T, B, C + tar = tar.transpose(0, 1) # B, T, C -> T, B, C + src2 = self.self_attn(tar, src, src, attn_mask=None, + key_padding_mask=None)[0] + src = src + self.dropout1(src2) + src = self.norm1(src) + + src2 = self.linear2(self.dropout(self.activation(self.linear1(src)))) + src = src + self.dropout2(src2) + src = self.norm2(src) + src = src.transpose(0, 1) # T, B, C -> B, T, C + return src diff --git a/talknet-asd/model/audioEncoder.py b/talknet-asd/model/audioEncoder.py new file mode 100644 index 0000000000000000000000000000000000000000..6aaaf66b29d9453662bd20a918ebff35229f2966 --- /dev/null +++ b/talknet-asd/model/audioEncoder.py @@ -0,0 +1,108 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class SEBasicBlock(nn.Module): + expansion = 1 + + def __init__(self, inplanes, planes, stride=1, downsample=None, reduction=8): + super(SEBasicBlock, self).__init__() + 
self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride, padding=1, bias=False) + self.bn1 = nn.BatchNorm2d(planes) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes) + self.relu = nn.ReLU(inplace=True) + self.se = SELayer(planes, reduction) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.relu(out) + out = self.bn1(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.se(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + return out + +class SELayer(nn.Module): + def __init__(self, channel, reduction=8): + super(SELayer, self).__init__() + self.avg_pool = nn.AdaptiveAvgPool2d(1) + self.fc = nn.Sequential( + nn.Linear(channel, channel // reduction), + nn.ReLU(inplace=True), + nn.Linear(channel // reduction, channel), + nn.Sigmoid() + ) + + def forward(self, x): + b, c, _, _ = x.size() + y = self.avg_pool(x).view(b, c) + y = self.fc(y).view(b, c, 1, 1) + return x * y + +class audioEncoder(nn.Module): + def __init__(self, layers, num_filters, **kwargs): + super(audioEncoder, self).__init__() + block = SEBasicBlock + self.inplanes = num_filters[0] + + self.conv1 = nn.Conv2d(1, num_filters[0] , kernel_size=7, stride=(2, 1), padding=3, + bias=False) + self.bn1 = nn.BatchNorm2d(num_filters[0]) + self.relu = nn.ReLU(inplace=True) + + self.layer1 = self._make_layer(block, num_filters[0], layers[0]) + self.layer2 = self._make_layer(block, num_filters[1], layers[1], stride=(2, 2)) + self.layer3 = self._make_layer(block, num_filters[2], layers[2], stride=(2, 2)) + self.layer4 = self._make_layer(block, num_filters[3], layers[3], stride=(1, 1)) + out_dim = num_filters[3] * block.expansion + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + elif isinstance(m, 
nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + def _make_layer(self, block, planes, blocks, stride=1): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2d(self.inplanes, planes * block.expansion, + kernel_size=1, stride=stride, bias=False), + nn.BatchNorm2d(planes * block.expansion), + ) + + layers = [] + layers.append(block(self.inplanes, planes, stride, downsample)) + self.inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append(block(self.inplanes, planes)) + + return nn.Sequential(*layers) + + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + x = torch.mean(x, dim=2, keepdim=True) + x = x.view((x.size()[0], x.size()[1], -1)) + x = x.transpose(1, 2) + + return x \ No newline at end of file diff --git a/talknet-asd/model/faceDetector/README.md b/talknet-asd/model/faceDetector/README.md new file mode 100644 index 0000000000000000000000000000000000000000..24fd251044e72bb38611d8741feb28a7fcdbb28e --- /dev/null +++ b/talknet-asd/model/faceDetector/README.md @@ -0,0 +1,3 @@ +# Face detector + +This face detector is adapted from `https://github.com/cs-giung/face-detection-pytorch`. 
diff --git a/talknet-asd/model/faceDetector/__init__.py b/talknet-asd/model/faceDetector/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..059d49bf0b8e8a17f641984e7d889e5b008257b9 --- /dev/null +++ b/talknet-asd/model/faceDetector/__init__.py @@ -0,0 +1 @@ +from .s3fd import S3FD \ No newline at end of file diff --git a/talknet-asd/model/faceDetector/s3fd/__init__.py b/talknet-asd/model/faceDetector/s3fd/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..24fc2491648541a92985ffb2cd9411e754446d27 --- /dev/null +++ b/talknet-asd/model/faceDetector/s3fd/__init__.py @@ -0,0 +1,66 @@ +import time, os, sys, subprocess +import numpy as np +import cv2 +import torch +from torchvision import transforms +from .nets import S3FDNet +from .box_utils import nms_ + +PATH_WEIGHT = 'model/faceDetector/s3fd/sfd_face.pth' +if os.path.isfile(PATH_WEIGHT) == False: + Link = "1KafnHz7ccT-3IyddBsL5yi2xGtxAKypt" + cmd = "gdown --id %s -O %s"%(Link, PATH_WEIGHT) + subprocess.call(cmd, shell=True, stdout=None) +img_mean = np.array([104., 117., 123.])[:, np.newaxis, np.newaxis].astype('float32') + + +class S3FD(): + + def __init__(self, device='cuda'): + + tstamp = time.time() + self.device = device + + # print('[S3FD] loading with', self.device) + self.net = S3FDNet(device=self.device).to(self.device) + PATH = os.path.join(os.getcwd(), PATH_WEIGHT) + state_dict = torch.load(PATH, map_location=self.device) + self.net.load_state_dict(state_dict) + self.net.eval() + # print('[S3FD] finished loading (%.4f sec)' % (time.time() - tstamp)) + + def detect_faces(self, image, conf_th=0.8, scales=[1]): + + w, h = image.shape[1], image.shape[0] + + bboxes = np.empty(shape=(0, 5)) + + with torch.no_grad(): + for s in scales: + scaled_img = cv2.resize(image, dsize=(0, 0), fx=s, fy=s, interpolation=cv2.INTER_LINEAR) + + scaled_img = np.swapaxes(scaled_img, 1, 2) + scaled_img = np.swapaxes(scaled_img, 1, 0) + scaled_img = scaled_img[[2, 1, 0], :, 
:] + scaled_img = scaled_img.astype('float32') + scaled_img -= img_mean + scaled_img = scaled_img[[2, 1, 0], :, :] + x = torch.from_numpy(scaled_img).unsqueeze(0).to(self.device) + y = self.net(x) + + detections = y.data + scale = torch.Tensor([w, h, w, h]) + + for i in range(detections.size(1)): + j = 0 + while detections[0, i, j, 0] > conf_th: + score = detections[0, i, j, 0] + pt = (detections[0, i, j, 1:] * scale).cpu().numpy() + bbox = (pt[0], pt[1], pt[2], pt[3], score) + bboxes = np.vstack((bboxes, bbox)) + j += 1 + + keep = nms_(bboxes, 0.1) + bboxes = bboxes[keep] + + return bboxes diff --git a/talknet-asd/model/faceDetector/s3fd/box_utils.py b/talknet-asd/model/faceDetector/s3fd/box_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..0fc3869318870c2ad45e62970fe991915f2b24fe --- /dev/null +++ b/talknet-asd/model/faceDetector/s3fd/box_utils.py @@ -0,0 +1,217 @@ +import numpy as np +from itertools import product as product +import torch +from torch.autograd import Function + + +def nms_(dets, thresh): + """ + Courtesy of Ross Girshick + [https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/nms/py_cpu_nms.py] + """ + x1 = dets[:, 0] + y1 = dets[:, 1] + x2 = dets[:, 2] + y2 = dets[:, 3] + scores = dets[:, 4] + + areas = (x2 - x1) * (y2 - y1) + order = scores.argsort()[::-1] + + keep = [] + while order.size > 0: + i = order[0] + keep.append(int(i)) + xx1 = np.maximum(x1[i], x1[order[1:]]) + yy1 = np.maximum(y1[i], y1[order[1:]]) + xx2 = np.minimum(x2[i], x2[order[1:]]) + yy2 = np.minimum(y2[i], y2[order[1:]]) + + w = np.maximum(0.0, xx2 - xx1) + h = np.maximum(0.0, yy2 - yy1) + inter = w * h + ovr = inter / (areas[i] + areas[order[1:]] - inter) + + inds = np.where(ovr <= thresh)[0] + order = order[inds + 1] + + return np.array(keep).astype(int) + + +def decode(loc, priors, variances): + """Decode locations from predictions using priors to undo + the encoding we did for offset regression at train time. 
+ Args: + loc (tensor): location predictions for loc layers, + Shape: [num_priors,4] + priors (tensor): Prior boxes in center-offset form. + Shape: [num_priors,4]. + variances: (list[float]) Variances of priorboxes + Return: + decoded bounding box predictions + """ + + boxes = torch.cat(( + priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:], + priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])), 1) + boxes[:, :2] -= boxes[:, 2:] / 2 + boxes[:, 2:] += boxes[:, :2] + return boxes + + +def nms(boxes, scores, overlap=0.5, top_k=200): + """Apply non-maximum suppression at test time to avoid detecting too many + overlapping bounding boxes for a given object. + Args: + boxes: (tensor) The location preds for the img, Shape: [num_priors,4]. + scores: (tensor) The class predscores for the img, Shape:[num_priors]. + overlap: (float) The overlap thresh for suppressing unnecessary boxes. + top_k: (int) The Maximum number of box preds to consider. + Return: + The indices of the kept boxes with respect to num_priors. 
+ """ + + keep = scores.new(scores.size(0)).zero_().long() + if boxes.numel() == 0: + return keep, 0 + x1 = boxes[:, 0] + y1 = boxes[:, 1] + x2 = boxes[:, 2] + y2 = boxes[:, 3] + area = torch.mul(x2 - x1, y2 - y1) + v, idx = scores.sort(0) # sort in ascending order + # I = I[v >= 0.01] + idx = idx[-top_k:] # indices of the top-k largest vals + xx1 = boxes.new() + yy1 = boxes.new() + xx2 = boxes.new() + yy2 = boxes.new() + w = boxes.new() + h = boxes.new() + + # keep = torch.Tensor() + count = 0 + while idx.numel() > 0: + i = idx[-1] # index of current largest val + # keep.append(i) + keep[count] = i + count += 1 + if idx.size(0) == 1: + break + idx = idx[:-1] # remove kept element from view + # load bboxes of next highest vals + torch.index_select(x1, 0, idx, out=xx1) + torch.index_select(y1, 0, idx, out=yy1) + torch.index_select(x2, 0, idx, out=xx2) + torch.index_select(y2, 0, idx, out=yy2) + # store element-wise max with next highest score + xx1 = torch.clamp(xx1, min=x1[i]) + yy1 = torch.clamp(yy1, min=y1[i]) + xx2 = torch.clamp(xx2, max=x2[i]) + yy2 = torch.clamp(yy2, max=y2[i]) + w.resize_as_(xx2) + h.resize_as_(yy2) + w = xx2 - xx1 + h = yy2 - yy1 + # check sizes of xx1 and xx2.. 
after each iteration + w = torch.clamp(w, min=0.0) + h = torch.clamp(h, min=0.0) + inter = w * h + # IoU = i / (area(a) + area(b) - i) + rem_areas = torch.index_select(area, 0, idx) # load remaining areas) + union = (rem_areas - inter) + area[i] + IoU = inter / union # store result in iou + # keep only elements with an IoU <= overlap + idx = idx[IoU.le(overlap)] + return keep, count + + +class Detect(object): + + def __init__(self, num_classes=2, + top_k=750, nms_thresh=0.3, conf_thresh=0.05, + variance=[0.1, 0.2], nms_top_k=5000): + + self.num_classes = num_classes + self.top_k = top_k + self.nms_thresh = nms_thresh + self.conf_thresh = conf_thresh + self.variance = variance + self.nms_top_k = nms_top_k + + def forward(self, loc_data, conf_data, prior_data): + + num = loc_data.size(0) + num_priors = prior_data.size(0) + + conf_preds = conf_data.view(num, num_priors, self.num_classes).transpose(2, 1) + batch_priors = prior_data.view(-1, num_priors, 4).expand(num, num_priors, 4) + batch_priors = batch_priors.contiguous().view(-1, 4) + + decoded_boxes = decode(loc_data.view(-1, 4), batch_priors, self.variance) + decoded_boxes = decoded_boxes.view(num, num_priors, 4) + + output = torch.zeros(num, self.num_classes, self.top_k, 5) + + for i in range(num): + boxes = decoded_boxes[i].clone() + conf_scores = conf_preds[i].clone() + + for cl in range(1, self.num_classes): + c_mask = conf_scores[cl].gt(self.conf_thresh) + scores = conf_scores[cl][c_mask] + + if scores.dim() == 0: + continue + l_mask = c_mask.unsqueeze(1).expand_as(boxes) + boxes_ = boxes[l_mask].view(-1, 4) + ids, count = nms(boxes_, scores, self.nms_thresh, self.nms_top_k) + count = count if count < self.top_k else self.top_k + + output[i, cl, :count] = torch.cat((scores[ids[:count]].unsqueeze(1), boxes_[ids[:count]]), 1) + + return output + + +class PriorBox(object): + + def __init__(self, input_size, feature_maps, + variance=[0.1, 0.2], + min_sizes=[16, 32, 64, 128, 256, 512], + steps=[4, 8, 16, 32, 64, 
128], + clip=False): + + super(PriorBox, self).__init__() + + self.imh = input_size[0] + self.imw = input_size[1] + self.feature_maps = feature_maps + + self.variance = variance + self.min_sizes = min_sizes + self.steps = steps + self.clip = clip + + def forward(self): + mean = [] + for k, fmap in enumerate(self.feature_maps): + feath = fmap[0] + featw = fmap[1] + for i, j in product(range(feath), range(featw)): + f_kw = self.imw / self.steps[k] + f_kh = self.imh / self.steps[k] + + cx = (j + 0.5) / f_kw + cy = (i + 0.5) / f_kh + + s_kw = self.min_sizes[k] / self.imw + s_kh = self.min_sizes[k] / self.imh + + mean += [cx, cy, s_kw, s_kh] + + output = torch.FloatTensor(mean).view(-1, 4) + + if self.clip: + output.clamp_(max=1, min=0) + + return output diff --git a/talknet-asd/model/faceDetector/s3fd/nets.py b/talknet-asd/model/faceDetector/s3fd/nets.py new file mode 100644 index 0000000000000000000000000000000000000000..185f9d4f08cf1750998e13829c8d79001fa87700 --- /dev/null +++ b/talknet-asd/model/faceDetector/s3fd/nets.py @@ -0,0 +1,174 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.nn.init as init +from .box_utils import Detect, PriorBox + + +class L2Norm(nn.Module): + + def __init__(self, n_channels, scale): + super(L2Norm, self).__init__() + self.n_channels = n_channels + self.gamma = scale or None + self.eps = 1e-10 + self.weight = nn.Parameter(torch.Tensor(self.n_channels)) + self.reset_parameters() + + def reset_parameters(self): + init.constant_(self.weight, self.gamma) + + def forward(self, x): + norm = x.pow(2).sum(dim=1, keepdim=True).sqrt() + self.eps + x = torch.div(x, norm) + out = self.weight.unsqueeze(0).unsqueeze(2).unsqueeze(3).expand_as(x) * x + return out + + +class S3FDNet(nn.Module): + + def __init__(self, device='cuda'): + super(S3FDNet, self).__init__() + self.device = device + + self.vgg = nn.ModuleList([ + nn.Conv2d(3, 64, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(64, 64, 3, 1, padding=1), 
+ nn.ReLU(inplace=True), + nn.MaxPool2d(2, 2), + + nn.Conv2d(64, 128, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(128, 128, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.MaxPool2d(2, 2), + + nn.Conv2d(128, 256, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(256, 256, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(256, 256, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.MaxPool2d(2, 2, ceil_mode=True), + + nn.Conv2d(256, 512, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(512, 512, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(512, 512, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.MaxPool2d(2, 2), + + nn.Conv2d(512, 512, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(512, 512, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(512, 512, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.MaxPool2d(2, 2), + + nn.Conv2d(512, 1024, 3, 1, padding=6, dilation=6), + nn.ReLU(inplace=True), + nn.Conv2d(1024, 1024, 1, 1), + nn.ReLU(inplace=True), + ]) + + self.L2Norm3_3 = L2Norm(256, 10) + self.L2Norm4_3 = L2Norm(512, 8) + self.L2Norm5_3 = L2Norm(512, 5) + + self.extras = nn.ModuleList([ + nn.Conv2d(1024, 256, 1, 1), + nn.Conv2d(256, 512, 3, 2, padding=1), + nn.Conv2d(512, 128, 1, 1), + nn.Conv2d(128, 256, 3, 2, padding=1), + ]) + + self.loc = nn.ModuleList([ + nn.Conv2d(256, 4, 3, 1, padding=1), + nn.Conv2d(512, 4, 3, 1, padding=1), + nn.Conv2d(512, 4, 3, 1, padding=1), + nn.Conv2d(1024, 4, 3, 1, padding=1), + nn.Conv2d(512, 4, 3, 1, padding=1), + nn.Conv2d(256, 4, 3, 1, padding=1), + ]) + + self.conf = nn.ModuleList([ + nn.Conv2d(256, 4, 3, 1, padding=1), + nn.Conv2d(512, 2, 3, 1, padding=1), + nn.Conv2d(512, 2, 3, 1, padding=1), + nn.Conv2d(1024, 2, 3, 1, padding=1), + nn.Conv2d(512, 2, 3, 1, padding=1), + nn.Conv2d(256, 2, 3, 1, padding=1), + ]) + + self.softmax = nn.Softmax(dim=-1) + self.detect = Detect() + + def forward(self, x): + size = x.size()[2:] + sources = list() + loc = list() + conf = list() + + for k 
in range(16): + x = self.vgg[k](x) + s = self.L2Norm3_3(x) + sources.append(s) + + for k in range(16, 23): + x = self.vgg[k](x) + s = self.L2Norm4_3(x) + sources.append(s) + + for k in range(23, 30): + x = self.vgg[k](x) + s = self.L2Norm5_3(x) + sources.append(s) + + for k in range(30, len(self.vgg)): + x = self.vgg[k](x) + sources.append(x) + + # apply extra layers and cache source layer outputs + for k, v in enumerate(self.extras): + x = F.relu(v(x), inplace=True) + if k % 2 == 1: + sources.append(x) + + # apply multibox head to source layers + loc_x = self.loc[0](sources[0]) + conf_x = self.conf[0](sources[0]) + + max_conf, _ = torch.max(conf_x[:, 0:3, :, :], dim=1, keepdim=True) + conf_x = torch.cat((max_conf, conf_x[:, 3:, :, :]), dim=1) + + loc.append(loc_x.permute(0, 2, 3, 1).contiguous()) + conf.append(conf_x.permute(0, 2, 3, 1).contiguous()) + + for i in range(1, len(sources)): + x = sources[i] + conf.append(self.conf[i](x).permute(0, 2, 3, 1).contiguous()) + loc.append(self.loc[i](x).permute(0, 2, 3, 1).contiguous()) + + features_maps = [] + for i in range(len(loc)): + feat = [] + feat += [loc[i].size(1), loc[i].size(2)] + features_maps += [feat] + + loc = torch.cat([o.view(o.size(0), -1) for o in loc], 1) + conf = torch.cat([o.view(o.size(0), -1) for o in conf], 1) + + with torch.no_grad(): + self.priorbox = PriorBox(size, features_maps) + self.priors = self.priorbox.forward() + + output = self.detect.forward( + loc.view(loc.size(0), -1, 4), + self.softmax(conf.view(conf.size(0), -1, 2)), + self.priors.type(type(x.data)).to(self.device) + ) + + return output diff --git a/talknet-asd/model/talkNetModel.py b/talknet-asd/model/talkNetModel.py new file mode 100644 index 0000000000000000000000000000000000000000..64cab3a827e661a71c54024611af815de6c1d8c8 --- /dev/null +++ b/talknet-asd/model/talkNetModel.py @@ -0,0 +1,64 @@ +import torch +import torch.nn as nn + +from model.audioEncoder import audioEncoder +from model.visualEncoder import visualFrontend, 
visualTCN, visualConv1D +from model.attentionLayer import attentionLayer + +class talkNetModel(nn.Module): + def __init__(self): + super(talkNetModel, self).__init__() + # Visual Temporal Encoder + self.visualFrontend = visualFrontend() # Visual Frontend + # self.visualFrontend.load_state_dict(torch.load('visual_frontend.pt', map_location="cuda")) + # for param in self.visualFrontend.parameters(): + # param.requires_grad = False + self.visualTCN = visualTCN() # Visual Temporal Network TCN + self.visualConv1D = visualConv1D() # Visual Temporal Network Conv1d + + # Audio Temporal Encoder + self.audioEncoder = audioEncoder(layers = [3, 4, 6, 3], num_filters = [16, 32, 64, 128]) + + # Audio-visual Cross Attention + self.crossA2V = attentionLayer(d_model = 128, nhead = 8) + self.crossV2A = attentionLayer(d_model = 128, nhead = 8) + + # Audio-visual Self Attention + self.selfAV = attentionLayer(d_model = 256, nhead = 8) + + def forward_visual_frontend(self, x): + B, T, W, H = x.shape + x = x.view(B*T, 1, 1, W, H) + x = (x / 255 - 0.4161) / 0.1688 + x = self.visualFrontend(x) + x = x.view(B, T, 512) + x = x.transpose(1,2) + x = self.visualTCN(x) + x = self.visualConv1D(x) + x = x.transpose(1,2) + return x + + def forward_audio_frontend(self, x): + x = x.unsqueeze(1).transpose(2, 3) + x = self.audioEncoder(x) + return x + + def forward_cross_attention(self, x1, x2): + x1_c = self.crossA2V(src = x1, tar = x2) + x2_c = self.crossV2A(src = x2, tar = x1) + return x1_c, x2_c + + def forward_audio_visual_backend(self, x1, x2): + x = torch.cat((x1,x2), 2) + x = self.selfAV(src = x, tar = x) + x = torch.reshape(x, (-1, 256)) + return x + + def forward_audio_backend(self,x): + x = torch.reshape(x, (-1, 128)) + return x + + def forward_visual_backend(self,x): + x = torch.reshape(x, (-1, 128)) + return x + diff --git a/talknet-asd/model/visualEncoder.py b/talknet-asd/model/visualEncoder.py new file mode 100644 index 
0000000000000000000000000000000000000000..e05f2cbd2378047e13b0357aea213ad26d0d13c7 --- /dev/null +++ b/talknet-asd/model/visualEncoder.py @@ -0,0 +1,172 @@ +## +# ResNet18 Pretrained network to extract lip embedding +# This code is modified based on https://github.com/lordmartian/deep_avsr +## + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class ResNetLayer(nn.Module): + + """ + A ResNet layer used to build the ResNet network. + Architecture: + --> conv-bn-relu -> conv -> + -> bn-relu -> conv-bn-relu -> conv -> + -> bn-relu --> + | | | | + -----> downsample ------> -------------------------------------> + """ + + def __init__(self, inplanes, outplanes, stride): + super(ResNetLayer, self).__init__() + self.conv1a = nn.Conv2d(inplanes, outplanes, kernel_size=3, stride=stride, padding=1, bias=False) + self.bn1a = nn.BatchNorm2d(outplanes, momentum=0.01, eps=0.001) + self.conv2a = nn.Conv2d(outplanes, outplanes, kernel_size=3, stride=1, padding=1, bias=False) + self.stride = stride + self.downsample = nn.Conv2d(inplanes, outplanes, kernel_size=(1,1), stride=stride, bias=False) + self.outbna = nn.BatchNorm2d(outplanes, momentum=0.01, eps=0.001) + + self.conv1b = nn.Conv2d(outplanes, outplanes, kernel_size=3, stride=1, padding=1, bias=False) + self.bn1b = nn.BatchNorm2d(outplanes, momentum=0.01, eps=0.001) + self.conv2b = nn.Conv2d(outplanes, outplanes, kernel_size=3, stride=1, padding=1, bias=False) + self.outbnb = nn.BatchNorm2d(outplanes, momentum=0.01, eps=0.001) + return + + + def forward(self, inputBatch): + batch = F.relu(self.bn1a(self.conv1a(inputBatch))) + batch = self.conv2a(batch) + if self.stride == 1: + residualBatch = inputBatch + else: + residualBatch = self.downsample(inputBatch) + batch = batch + residualBatch + intermediateBatch = batch + batch = F.relu(self.outbna(batch)) + + batch = F.relu(self.bn1b(self.conv1b(batch))) + batch = self.conv2b(batch) + residualBatch = intermediateBatch + batch = batch + residualBatch + 
outputBatch = F.relu(self.outbnb(batch)) + return outputBatch + + + +class ResNet(nn.Module): + + """ + An 18-layer ResNet architecture. + """ + + def __init__(self): + super(ResNet, self).__init__() + self.layer1 = ResNetLayer(64, 64, stride=1) + self.layer2 = ResNetLayer(64, 128, stride=2) + self.layer3 = ResNetLayer(128, 256, stride=2) + self.layer4 = ResNetLayer(256, 512, stride=2) + self.avgpool = nn.AvgPool2d(kernel_size=(4,4), stride=(1,1)) + + return + + + def forward(self, inputBatch): + batch = self.layer1(inputBatch) + batch = self.layer2(batch) + batch = self.layer3(batch) + batch = self.layer4(batch) + outputBatch = self.avgpool(batch) + return outputBatch + + +class GlobalLayerNorm(nn.Module): + def __init__(self, channel_size): + super(GlobalLayerNorm, self).__init__() + self.gamma = nn.Parameter(torch.Tensor(1, channel_size, 1)) # [1, N, 1] + self.beta = nn.Parameter(torch.Tensor(1, channel_size, 1)) # [1, N, 1] + self.reset_parameters() + + def reset_parameters(self): + self.gamma.data.fill_(1) + self.beta.data.zero_() + + def forward(self, y): + mean = y.mean(dim=1, keepdim=True).mean(dim=2, keepdim=True) #[M, 1, 1] + var = (torch.pow(y-mean, 2)).mean(dim=1, keepdim=True).mean(dim=2, keepdim=True) + gLN_y = self.gamma * (y - mean) / torch.pow(var + 1e-8, 0.5) + self.beta + return gLN_y + +class visualFrontend(nn.Module): + + """ + A visual feature extraction module. Generates a 512-dim feature vector per video frame. + Architecture: A 3D convolution block followed by an 18-layer ResNet. 
+ """ + + def __init__(self): + super(visualFrontend, self).__init__() + self.frontend3D = nn.Sequential( + nn.Conv3d(1, 64, kernel_size=(5,7,7), stride=(1,2,2), padding=(2,3,3), bias=False), + nn.BatchNorm3d(64, momentum=0.01, eps=0.001), + nn.ReLU(), + nn.MaxPool3d(kernel_size=(1,3,3), stride=(1,2,2), padding=(0,1,1)) + ) + self.resnet = ResNet() + return + + + def forward(self, inputBatch): + inputBatch = inputBatch.transpose(0, 1).transpose(1, 2) + batchsize = inputBatch.shape[0] + batch = self.frontend3D(inputBatch) + + batch = batch.transpose(1, 2) + batch = batch.reshape(batch.shape[0]*batch.shape[1], batch.shape[2], batch.shape[3], batch.shape[4]) + outputBatch = self.resnet(batch) + outputBatch = outputBatch.reshape(batchsize, -1, 512) + outputBatch = outputBatch.transpose(1 ,2) + outputBatch = outputBatch.transpose(1, 2).transpose(0, 1) + return outputBatch + +class DSConv1d(nn.Module): + def __init__(self): + super(DSConv1d, self).__init__() + self.net = nn.Sequential( + nn.ReLU(), + nn.BatchNorm1d(512), + nn.Conv1d(512, 512, 3, stride=1, padding=1,dilation=1, groups=512, bias=False), + nn.PReLU(), + GlobalLayerNorm(512), + nn.Conv1d(512, 512, 1, bias=False), + ) + + def forward(self, x): + out = self.net(x) + return out + x + +class visualTCN(nn.Module): + def __init__(self): + super(visualTCN, self).__init__() + stacks = [] + for x in range(5): + stacks += [DSConv1d()] + self.net = nn.Sequential(*stacks) # Visual Temporal Network V-TCN + + def forward(self, x): + out = self.net(x) + return out + +class visualConv1D(nn.Module): + def __init__(self): + super(visualConv1D, self).__init__() + self.net = nn.Sequential( + nn.Conv1d(512, 256, 5, stride=1, padding=2), + nn.BatchNorm1d(256), + nn.ReLU(), + nn.Conv1d(256, 128, 1), + ) + + def forward(self, x): + out = self.net(x) + return out \ No newline at end of file diff --git a/talknet-asd/predict.py b/talknet-asd/predict.py new file mode 100644 index 
0000000000000000000000000000000000000000..dd552dc4d2eede08a59423ad87f4a81d0982ea23 --- /dev/null +++ b/talknet-asd/predict.py @@ -0,0 +1,201 @@ +import os +import cv2 +import json +import glob +import pickle +import shutil +import subprocess +from typing import List, Optional +from cog import BasePredictor, BaseModel, Input, Path + + +class Output(BaseModel): + media_path: Optional[List[Path]] + json_str: Optional[str] + + +class Predictor(BasePredictor): + def setup(self): + pass + + def predict( + self, + video: Path = Input(description="Path to the video"), + face_det_scale: float = Input( + default=0.25, + description="Scale factor for face detection, the frames will be scaled to 0.25 of the original", + ge=0, + le=1, + ), + min_track: int = Input( + default=10, description="Number of min frames for each shot" + ), + num_failed_det: int = Input( + default=10, + description="Number of missed detections allowed before tracking is stopped", + ge=1, + ), + min_face_size: int = Input( + default=1, description="Minimum face size in pixels", ge=1 + ), + crop_scale: float = Input( + default=0.40, description="Scale bounding box", ge=0, le=1 + ), + start: int = Input(default=0, description="The start time of the video", ge=0), + duration: int = Input( + default=-1, + description="The duration of the video, when set as -1, will extract the whole video", + ), + return_json: bool = Input( + description="Return results in json format", default=True + ), + return_boundingbox_percentages: bool = Input( + description="Return bounding box coordinates as percentages of the video width and height", + default=False, + ), + ) -> Output: + + video_path = str(video) + video_name = os.path.splitext(os.path.basename(video_path))[0] + video_folder = "demo" + + # Clean up and create the video folder + shutil.rmtree(video_folder, ignore_errors=True) + os.makedirs(video_folder, exist_ok=True) + + # Copy the input video to the video folder + target_video_path = os.path.join(video_folder, 
os.path.basename(video_path)) + shutil.copy(video_path, target_video_path) + + duration = max(0, duration) + n_data_loader_thread = 32 + + # Run the demoTalkNet.py script with the provided arguments + command = ( + f"python demoTalkNet.py --videoName {video_name} " + f"--videoFolder {video_folder} " + f"--pretrainModel pretrain_TalkSet.model " + f"--nDataLoaderThread {n_data_loader_thread} " + f"--facedetScale {face_det_scale} " + f"--minTrack {min_track} " + f"--numFailedDet {num_failed_det} " + f"--minFaceSize {min_face_size} " + f"--cropScale {crop_scale} " + f"--start {start} " + f"--duration {duration} " + ) + + process = subprocess.Popen( + command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) + stdout, stderr = process.communicate() + print(f"Command output: {stdout.decode()}") + if stderr: + print(f"Command errors: {stderr.decode()}") + + # Find the most recent pywork folder + pywork_folders = glob.glob(os.path.join(video_folder, "*", "pywork")) + latest_pywork_folder = max(pywork_folders, key=os.path.getctime) + + # Load the face tracks and scores from the pickle files generated by demoTalkNet.py + tracks_file = os.path.join(latest_pywork_folder, "tracks.pckl") + scores_file = os.path.join(latest_pywork_folder, "scores.pckl") + with open(tracks_file, "rb") as f: + face_tracks = pickle.load(f) # list + with open(scores_file, "rb") as f: + scores = pickle.load(f) # list + + # Get the video dimensions + video = cv2.VideoCapture(target_video_path) + video_width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH)) + video_height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT)) + video.release() + + # Convert face tracks and scores to the desired JSON format + output_data = [] + for track_idx, track in enumerate(face_tracks): + # Get the frame numbers for the current track + frames = track["track"]["frame"] + + # Get the bounding box information for the current track + boxes = track["proc_track"] + + # Get the speaking scores for the current track + # If the 
track index is out of range, use an empty list + speaking_scores = scores[track_idx] if track_idx < len(scores) else [] + + for i, frame in enumerate(frames): + # Check if the current index is within the valid range of the bounding box information + # If not, break the loop and move to the next track + if i >= len(boxes["x"]) or i >= len(boxes["y"]) or i >= len(boxes["s"]): + break + + # Calculate bounding box coordinates + x0 = int(boxes["x"][i] - boxes["s"][i]) + y0 = int(boxes["y"][i] - boxes["s"][i]) + x1 = int(boxes["x"][i] + boxes["s"][i]) + y1 = int(boxes["y"][i] + boxes["s"][i]) + + # Normalize the bounding box coordinates if required + if return_boundingbox_percentages: + x0 /= video_width + y0 /= video_height + x1 /= video_width + y1 /= video_height + + # Determine speaking status + speaking = ( + bool(speaking_scores[i] >= 0) if i < len(speaking_scores) else False + ) + + # Create the bounding box dictionary + box = { + "face_id": track_idx, + "x0": x0, + "y0": y0, + "x1": x1, + "y1": y1, + "speaking": speaking, + } + + # Create a dictionary for each frame if it doesn't exist + frame_data = next( + ( + data + for data in output_data + if data["frame_number"] == int(frame) + ), + None, + ) + if frame_data is None: + frame_data = {"frame_number": int(frame), "faces": []} + output_data.append(frame_data) + + # Add the current face's bounding box and speaking status to the frame's data + frame_data["faces"].append(box) + + # Convert the output data to JSON string + json_str = json.dumps(output_data) + + if return_json: + return Output(json_str=json_str) + else: + mp4_files = [] + excluded_files = ["video_only.avi", "video.avi"] + avi_files = [ + avi_file + for avi_file in Path(video_folder).rglob("*.avi") + if avi_file.name not in excluded_files + ] + for avi_file in avi_files: + mp4_file = avi_file.with_suffix(".mp4") + conversion_command = f"ffmpeg -i {avi_file} {mp4_file}" + conversion_process = subprocess.run( + conversion_command, + shell=True, + 
def download_video(video_url):
    """Download *video_url* to a temporary .mp4 file and return its path.

    The caller owns the temporary file and is responsible for deleting it.
    Raises requests.HTTPError on a non-2xx response instead of silently
    saving the error body as a broken "video".
    """
    response = requests.get(video_url)
    response.raise_for_status()  # fail loudly on 404/500 instead of writing HTML to .mp4
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
    temp_file.write(response.content)
    temp_file.close()
    return temp_file.name


def extract_audio(video_path: str, output_audio_path: str) -> bool:
    """Extract the audio track of *video_path* to *output_audio_path* as MP3.

    Returns True when a non-empty audio file was produced, False otherwise
    (e.g. the video has no audio stream or ffmpeg failed).
    """
    import shlex  # local import: keeps the notebook cell self-contained

    print(f"[~] Extracting audio from {video_path} to {output_audio_path}")
    # Quote both paths so files with spaces or shell metacharacters neither
    # break the command nor inject into the shell.
    command = (
        f"ffmpeg -i {shlex.quote(video_path)} -vn -ar 44100 -ac 2 "
        f"-ab 192k -f mp3 {shlex.quote(output_audio_path)}"
    )
    os.system(command)
    return check_audio_extraction_success(output_audio_path)


def check_audio_extraction_success(output_audio_path: str) -> bool:
    """Return True iff *output_audio_path* exists and is non-empty."""
    if os.path.exists(output_audio_path) and os.path.getsize(output_audio_path) > 0:
        return True
    else:
        print("[!] No audio stream found in the video or extraction failed.")
        return False


def process_video_frames(video, data, output_frames_pattern):
    """Render each annotated frame of *data* from *video* to PNG files.

    data: list of {"frame_number": int, "faces": [...]} dicts (predictor JSON).
    output_frames_pattern: printf-style path pattern, e.g. ".../frame_%05d.png".
    Frames the VideoCapture cannot read are silently skipped.
    """
    for frame_data in data:
        frame_number = frame_data["frame_number"]
        video.set(cv2.CAP_PROP_POS_FRAMES, frame_number)
        ret, frame = video.read()
        if ret:
            draw_faces(frame, frame_data["faces"])
            cv2.imwrite(output_frames_pattern % frame_number, frame)


def draw_faces(frame, faces):
    """Draw one labeled bounding box per face dict onto *frame* in place.

    Speaking faces are drawn green, silent faces red. Coordinates are cast
    to int because the predictor may emit normalized float percentages
    (return_boundingbox_percentages=True) and OpenCV drawing functions
    require integer pixel coordinates.
    """
    for face in faces:
        # int() fixes a crash when the JSON carries float coordinates.
        x0, y0 = int(face["x0"]), int(face["y0"])
        x1, y1 = int(face["x1"]), int(face["y1"])
        GREEN = (0, 255, 0)
        RED = (0, 0, 255)  # BGR so red is last channel
        color = GREEN if face["speaking"] else RED
        cv2.rectangle(
            frame,
            (x0, y0),
            (x1, y1),
            color,
            10,
        )
        cv2.putText(
            frame,
            f"Face {face['face_id']}",
            (x0, y0 - 10),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.9,
            color,
            2,
        )


def visualize_output(video_path, json_output):
    """Burn the predictor's face/speaking annotations into a new video.

    video_path: source video file.
    json_output: JSON string produced by the predictor (list of frame dicts).
    Returns the path of the rendered output video ("output_video.mp4").
    """
    data = json.loads(json_output)
    temp_dir = tempfile.mkdtemp()
    try:
        output_frames_pattern = os.path.join(temp_dir, "frame_%05d.png")
        video = cv2.VideoCapture(video_path)
        fps = int(video.get(cv2.CAP_PROP_FPS))
        process_video_frames(video, data, output_frames_pattern)
        video.release()
        audio_path = os.path.join(temp_dir, "audio.mp3")
        has_audio = extract_audio(video_path, audio_path)
        output_video_path = generate_output_video(
            fps,
            output_frames_pattern,
            audio_path,
            has_audio,
        )
    finally:
        # Always reclaim the frame/audio scratch space, even if rendering fails.
        shutil.rmtree(temp_dir)
    return output_video_path


def generate_output_video(fps, output_frames_pattern, audio_path, has_audio):
    """Assemble the rendered PNG frames (plus optional audio) into an MP4.

    Returns the output path "output_video.mp4" in the current directory.
    """
    import shlex  # local import: keeps the notebook cell self-contained

    output_video_path = "output_video.mp4"
    frames = shlex.quote(output_frames_pattern)
    if has_audio:
        ffmpeg_cmd = (
            f"ffmpeg -y -framerate {fps} -i {frames} -i {shlex.quote(audio_path)} "
            f"-c:v libx264 -pix_fmt yuv420p -c:a aac -strict experimental "
            f"{output_video_path}"
        )
    else:
        ffmpeg_cmd = (
            f"ffmpeg -y -framerate {fps} -i {frames} "
            f"-c:v libx264 -pix_fmt yuv420p {output_video_path}"
        )
    os.system(ffmpeg_cmd)
    return output_video_path
\n", + "100%|██████████| 2/2 [00:01<00:00, 1.12it/s]\n", + "2024-05-01 17:34:35 Scores extracted and saved in demo/tmpmao5i_kv/pywork \n", + "100%|██████████| 438/438 [00:10<00:00, 43.19it/s]\n", + "\n" + ] + } + ], + "source": [ + "# Create an instance of the Predictor class\n", + "p = Predictor()\n", + "\n", + "# Run the prediction on the downloaded video\n", + "result = p.predict(\n", + " video=video_path,\n", + " start=0,\n", + " duration=0,\n", + " min_track=10,\n", + " crop_scale=0.4,\n", + " min_face_size=1,\n", + " face_det_scale=0.25,\n", + " num_failed_det=10,\n", + " return_json=True,\n", + " return_boundingbox_percentages=False, # Return pixel coords of faces\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[\n", + " {\n", + " \"frame_number\": 0,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 287,\n", + " \"y0\": 120,\n", + " \"x1\": 450,\n", + " \"y1\": 283,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1427,\n", + " \"y0\": 86,\n", + " \"x1\": 1603,\n", + " \"y1\": 262,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 1,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 287,\n", + " \"y0\": 120,\n", + " \"x1\": 450,\n", + " \"y1\": 283,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1427,\n", + " \"y0\": 86,\n", + " \"x1\": 1603,\n", + " \"y1\": 262,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 2,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 287,\n", + " \"y0\": 120,\n", + " \"x1\": 450,\n", + " \"y1\": 283,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1427,\n", + " \"y0\": 86,\n", + " \"x1\": 1603,\n", + " \"y1\": 262,\n", + " \"speaking\": 
false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 3,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 287,\n", + " \"y0\": 120,\n", + " \"x1\": 451,\n", + " \"y1\": 283,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1427,\n", + " \"y0\": 86,\n", + " \"x1\": 1603,\n", + " \"y1\": 262,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 4,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 288,\n", + " \"y0\": 120,\n", + " \"x1\": 452,\n", + " \"y1\": 283,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1427,\n", + " \"y0\": 86,\n", + " \"x1\": 1604,\n", + " \"y1\": 263,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 5,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 291,\n", + " \"y0\": 120,\n", + " \"x1\": 454,\n", + " \"y1\": 283,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1427,\n", + " \"y0\": 86,\n", + " \"x1\": 1604,\n", + " \"y1\": 263,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 6,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 294,\n", + " \"y0\": 120,\n", + " \"x1\": 458,\n", + " \"y1\": 283,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1427,\n", + " \"y0\": 86,\n", + " \"x1\": 1604,\n", + " \"y1\": 263,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 7,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 299,\n", + " \"y0\": 120,\n", + " \"x1\": 460,\n", + " \"y1\": 281,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1427,\n", + " \"y0\": 86,\n", + " \"x1\": 1604,\n", + " \"y1\": 263,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", 
+ " },\n", + " {\n", + " \"frame_number\": 8,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 299,\n", + " \"y0\": 119,\n", + " \"x1\": 459,\n", + " \"y1\": 279,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1427,\n", + " \"y0\": 86,\n", + " \"x1\": 1604,\n", + " \"y1\": 263,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 9,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 299,\n", + " \"y0\": 117,\n", + " \"x1\": 459,\n", + " \"y1\": 277,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1427,\n", + " \"y0\": 86,\n", + " \"x1\": 1604,\n", + " \"y1\": 263,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 10,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 299,\n", + " \"y0\": 115,\n", + " \"x1\": 459,\n", + " \"y1\": 275,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1428,\n", + " \"y0\": 86,\n", + " \"x1\": 1605,\n", + " \"y1\": 263,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 11,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 299,\n", + " \"y0\": 113,\n", + " \"x1\": 459,\n", + " \"y1\": 273,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1428,\n", + " \"y0\": 86,\n", + " \"x1\": 1605,\n", + " \"y1\": 263,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 12,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 299,\n", + " \"y0\": 113,\n", + " \"x1\": 459,\n", + " \"y1\": 272,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1428,\n", + " \"y0\": 86,\n", + " \"x1\": 1604,\n", + " \"y1\": 262,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " 
\"frame_number\": 13,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 299,\n", + " \"y0\": 112,\n", + " \"x1\": 459,\n", + " \"y1\": 271,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1428,\n", + " \"y0\": 86,\n", + " \"x1\": 1604,\n", + " \"y1\": 262,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 14,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 299,\n", + " \"y0\": 112,\n", + " \"x1\": 459,\n", + " \"y1\": 271,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1428,\n", + " \"y0\": 86,\n", + " \"x1\": 1604,\n", + " \"y1\": 262,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 15,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 298,\n", + " \"y0\": 112,\n", + " \"x1\": 458,\n", + " \"y1\": 272,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1428,\n", + " \"y0\": 86,\n", + " \"x1\": 1604,\n", + " \"y1\": 262,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 16,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 298,\n", + " \"y0\": 112,\n", + " \"x1\": 458,\n", + " \"y1\": 272,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1428,\n", + " \"y0\": 86,\n", + " \"x1\": 1604,\n", + " \"y1\": 262,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 17,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 298,\n", + " \"y0\": 112,\n", + " \"x1\": 458,\n", + " \"y1\": 272,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1429,\n", + " \"y0\": 86,\n", + " \"x1\": 1603,\n", + " \"y1\": 261,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 
18,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 297,\n", + " \"y0\": 111,\n", + " \"x1\": 459,\n", + " \"y1\": 273,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1428,\n", + " \"y0\": 86,\n", + " \"x1\": 1603,\n", + " \"y1\": 261,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 19,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 297,\n", + " \"y0\": 111,\n", + " \"x1\": 459,\n", + " \"y1\": 273,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1428,\n", + " \"y0\": 86,\n", + " \"x1\": 1603,\n", + " \"y1\": 261,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 20,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 297,\n", + " \"y0\": 111,\n", + " \"x1\": 459,\n", + " \"y1\": 273,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1428,\n", + " \"y0\": 87,\n", + " \"x1\": 1603,\n", + " \"y1\": 261,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 21,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 297,\n", + " \"y0\": 112,\n", + " \"x1\": 459,\n", + " \"y1\": 274,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1428,\n", + " \"y0\": 87,\n", + " \"x1\": 1603,\n", + " \"y1\": 261,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 22,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 297,\n", + " \"y0\": 112,\n", + " \"x1\": 460,\n", + " \"y1\": 274,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1428,\n", + " \"y0\": 87,\n", + " \"x1\": 1603,\n", + " \"y1\": 262,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 23,\n", + " \"faces\": 
[\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 298,\n", + " \"y0\": 112,\n", + " \"x1\": 460,\n", + " \"y1\": 274,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1428,\n", + " \"y0\": 87,\n", + " \"x1\": 1603,\n", + " \"y1\": 262,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 24,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 298,\n", + " \"y0\": 112,\n", + " \"x1\": 461,\n", + " \"y1\": 274,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1428,\n", + " \"y0\": 87,\n", + " \"x1\": 1603,\n", + " \"y1\": 262,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 25,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 298,\n", + " \"y0\": 112,\n", + " \"x1\": 461,\n", + " \"y1\": 275,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1428,\n", + " \"y0\": 87,\n", + " \"x1\": 1603,\n", + " \"y1\": 262,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 26,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 298,\n", + " \"y0\": 112,\n", + " \"x1\": 461,\n", + " \"y1\": 275,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1428,\n", + " \"y0\": 87,\n", + " \"x1\": 1603,\n", + " \"y1\": 262,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 27,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 298,\n", + " \"y0\": 112,\n", + " \"x1\": 461,\n", + " \"y1\": 275,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1428,\n", + " \"y0\": 87,\n", + " \"x1\": 1603,\n", + " \"y1\": 262,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 28,\n", + " \"faces\": [\n", + " {\n", + " 
\"face_id\": 0,\n", + " \"x0\": 298,\n", + " \"y0\": 112,\n", + " \"x1\": 461,\n", + " \"y1\": 275,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1428,\n", + " \"y0\": 87,\n", + " \"x1\": 1603,\n", + " \"y1\": 262,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 29,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 298,\n", + " \"y0\": 112,\n", + " \"x1\": 461,\n", + " \"y1\": 275,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1428,\n", + " \"y0\": 87,\n", + " \"x1\": 1602,\n", + " \"y1\": 262,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 30,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 297,\n", + " \"y0\": 112,\n", + " \"x1\": 460,\n", + " \"y1\": 275,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1428,\n", + " \"y0\": 87,\n", + " \"x1\": 1602,\n", + " \"y1\": 262,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 31,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 297,\n", + " \"y0\": 112,\n", + " \"x1\": 460,\n", + " \"y1\": 275,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1428,\n", + " \"y0\": 87,\n", + " \"x1\": 1602,\n", + " \"y1\": 262,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 32,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 297,\n", + " \"y0\": 112,\n", + " \"x1\": 460,\n", + " \"y1\": 275,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1428,\n", + " \"y0\": 87,\n", + " \"x1\": 1602,\n", + " \"y1\": 262,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 33,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " 
\"x0\": 296,\n", + " \"y0\": 112,\n", + " \"x1\": 459,\n", + " \"y1\": 275,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1428,\n", + " \"y0\": 87,\n", + " \"x1\": 1602,\n", + " \"y1\": 262,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 34,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 295,\n", + " \"y0\": 112,\n", + " \"x1\": 458,\n", + " \"y1\": 275,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1428,\n", + " \"y0\": 87,\n", + " \"x1\": 1602,\n", + " \"y1\": 262,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 35,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 294,\n", + " \"y0\": 112,\n", + " \"x1\": 457,\n", + " \"y1\": 275,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1428,\n", + " \"y0\": 87,\n", + " \"x1\": 1602,\n", + " \"y1\": 262,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 36,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 293,\n", + " \"y0\": 112,\n", + " \"x1\": 457,\n", + " \"y1\": 275,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1428,\n", + " \"y0\": 88,\n", + " \"x1\": 1602,\n", + " \"y1\": 262,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 37,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 292,\n", + " \"y0\": 112,\n", + " \"x1\": 456,\n", + " \"y1\": 275,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1428,\n", + " \"y0\": 88,\n", + " \"x1\": 1602,\n", + " \"y1\": 262,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 38,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 292,\n", + " 
\"y0\": 111,\n", + " \"x1\": 456,\n", + " \"y1\": 275,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1428,\n", + " \"y0\": 88,\n", + " \"x1\": 1602,\n", + " \"y1\": 262,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 39,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 292,\n", + " \"y0\": 111,\n", + " \"x1\": 456,\n", + " \"y1\": 276,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1428,\n", + " \"y0\": 88,\n", + " \"x1\": 1602,\n", + " \"y1\": 262,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 40,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 292,\n", + " \"y0\": 111,\n", + " \"x1\": 456,\n", + " \"y1\": 276,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1428,\n", + " \"y0\": 88,\n", + " \"x1\": 1602,\n", + " \"y1\": 262,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 41,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 292,\n", + " \"y0\": 111,\n", + " \"x1\": 456,\n", + " \"y1\": 276,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1428,\n", + " \"y0\": 88,\n", + " \"x1\": 1602,\n", + " \"y1\": 262,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 42,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 291,\n", + " \"y0\": 111,\n", + " \"x1\": 456,\n", + " \"y1\": 276,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1428,\n", + " \"y0\": 88,\n", + " \"x1\": 1602,\n", + " \"y1\": 262,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 43,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 291,\n", + " \"y0\": 111,\n", + " 
\"x1\": 456,\n", + " \"y1\": 276,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1428,\n", + " \"y0\": 88,\n", + " \"x1\": 1602,\n", + " \"y1\": 262,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 44,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 291,\n", + " \"y0\": 111,\n", + " \"x1\": 455,\n", + " \"y1\": 276,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1428,\n", + " \"y0\": 88,\n", + " \"x1\": 1602,\n", + " \"y1\": 262,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 45,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 290,\n", + " \"y0\": 111,\n", + " \"x1\": 455,\n", + " \"y1\": 275,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1428,\n", + " \"y0\": 88,\n", + " \"x1\": 1602,\n", + " \"y1\": 262,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 46,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 290,\n", + " \"y0\": 110,\n", + " \"x1\": 454,\n", + " \"y1\": 275,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1428,\n", + " \"y0\": 88,\n", + " \"x1\": 1602,\n", + " \"y1\": 262,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 47,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 290,\n", + " \"y0\": 110,\n", + " \"x1\": 454,\n", + " \"y1\": 275,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1428,\n", + " \"y0\": 88,\n", + " \"x1\": 1602,\n", + " \"y1\": 262,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 48,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 290,\n", + " \"y0\": 110,\n", + " \"x1\": 454,\n", + " 
\"y1\": 274,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1429,\n", + " \"y0\": 88,\n", + " \"x1\": 1602,\n", + " \"y1\": 262,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 49,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 290,\n", + " \"y0\": 110,\n", + " \"x1\": 454,\n", + " \"y1\": 273,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1429,\n", + " \"y0\": 88,\n", + " \"x1\": 1602,\n", + " \"y1\": 262,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 50,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 290,\n", + " \"y0\": 110,\n", + " \"x1\": 453,\n", + " \"y1\": 273,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1428,\n", + " \"y0\": 88,\n", + " \"x1\": 1602,\n", + " \"y1\": 262,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 51,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 290,\n", + " \"y0\": 110,\n", + " \"x1\": 453,\n", + " \"y1\": 273,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1428,\n", + " \"y0\": 88,\n", + " \"x1\": 1603,\n", + " \"y1\": 262,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 52,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 290,\n", + " \"y0\": 110,\n", + " \"x1\": 453,\n", + " \"y1\": 273,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1428,\n", + " \"y0\": 88,\n", + " \"x1\": 1603,\n", + " \"y1\": 262,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 53,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 290,\n", + " \"y0\": 110,\n", + " \"x1\": 453,\n", + " \"y1\": 273,\n", + " 
\"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1428,\n", + " \"y0\": 88,\n", + " \"x1\": 1603,\n", + " \"y1\": 262,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 54,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 290,\n", + " \"y0\": 110,\n", + " \"x1\": 453,\n", + " \"y1\": 273,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1428,\n", + " \"y0\": 87,\n", + " \"x1\": 1603,\n", + " \"y1\": 263,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 55,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 290,\n", + " \"y0\": 110,\n", + " \"x1\": 451,\n", + " \"y1\": 272,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1428,\n", + " \"y0\": 87,\n", + " \"x1\": 1603,\n", + " \"y1\": 263,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 56,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 289,\n", + " \"y0\": 110,\n", + " \"x1\": 450,\n", + " \"y1\": 271,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1428,\n", + " \"y0\": 87,\n", + " \"x1\": 1603,\n", + " \"y1\": 263,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 57,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 288,\n", + " \"y0\": 110,\n", + " \"x1\": 449,\n", + " \"y1\": 270,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1428,\n", + " \"y0\": 87,\n", + " \"x1\": 1603,\n", + " \"y1\": 262,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 58,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 288,\n", + " \"y0\": 109,\n", + " \"x1\": 449,\n", + " \"y1\": 270,\n", + " \"speaking\": 
false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1428,\n", + " \"y0\": 85,\n", + " \"x1\": 1603,\n", + " \"y1\": 260,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 59,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 288,\n", + " \"y0\": 109,\n", + " \"x1\": 449,\n", + " \"y1\": 270,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1428,\n", + " \"y0\": 81,\n", + " \"x1\": 1603,\n", + " \"y1\": 256,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 60,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 288,\n", + " \"y0\": 109,\n", + " \"x1\": 449,\n", + " \"y1\": 270,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1429,\n", + " \"y0\": 77,\n", + " \"x1\": 1606,\n", + " \"y1\": 253,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 61,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 288,\n", + " \"y0\": 109,\n", + " \"x1\": 449,\n", + " \"y1\": 270,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1432,\n", + " \"y0\": 76,\n", + " \"x1\": 1608,\n", + " \"y1\": 253,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 62,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 288,\n", + " \"y0\": 109,\n", + " \"x1\": 449,\n", + " \"y1\": 270,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1433,\n", + " \"y0\": 76,\n", + " \"x1\": 1610,\n", + " \"y1\": 254,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 63,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 288,\n", + " \"y0\": 109,\n", + " \"x1\": 449,\n", + " \"y1\": 270,\n", + " \"speaking\": false\n", + " 
},\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1433,\n", + " \"y0\": 76,\n", + " \"x1\": 1610,\n", + " \"y1\": 254,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 64,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 288,\n", + " \"y0\": 109,\n", + " \"x1\": 449,\n", + " \"y1\": 270,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1433,\n", + " \"y0\": 76,\n", + " \"x1\": 1610,\n", + " \"y1\": 253,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 65,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 289,\n", + " \"y0\": 109,\n", + " \"x1\": 449,\n", + " \"y1\": 270,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1433,\n", + " \"y0\": 77,\n", + " \"x1\": 1610,\n", + " \"y1\": 253,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 66,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 289,\n", + " \"y0\": 109,\n", + " \"x1\": 450,\n", + " \"y1\": 270,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1433,\n", + " \"y0\": 77,\n", + " \"x1\": 1610,\n", + " \"y1\": 253,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 67,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 289,\n", + " \"y0\": 109,\n", + " \"x1\": 450,\n", + " \"y1\": 270,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1434,\n", + " \"y0\": 77,\n", + " \"x1\": 1609,\n", + " \"y1\": 253,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 68,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 289,\n", + " \"y0\": 109,\n", + " \"x1\": 450,\n", + " \"y1\": 270,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " 
\"face_id\": 1,\n", + " \"x0\": 1434,\n", + " \"y0\": 77,\n", + " \"x1\": 1609,\n", + " \"y1\": 253,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 69,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 289,\n", + " \"y0\": 110,\n", + " \"x1\": 450,\n", + " \"y1\": 270,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1434,\n", + " \"y0\": 78,\n", + " \"x1\": 1609,\n", + " \"y1\": 254,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 70,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 289,\n", + " \"y0\": 110,\n", + " \"x1\": 450,\n", + " \"y1\": 271,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1433,\n", + " \"y0\": 78,\n", + " \"x1\": 1609,\n", + " \"y1\": 254,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 71,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 289,\n", + " \"y0\": 110,\n", + " \"x1\": 450,\n", + " \"y1\": 271,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1432,\n", + " \"y0\": 79,\n", + " \"x1\": 1608,\n", + " \"y1\": 255,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 72,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 289,\n", + " \"y0\": 110,\n", + " \"x1\": 450,\n", + " \"y1\": 271,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1431,\n", + " \"y0\": 81,\n", + " \"x1\": 1607,\n", + " \"y1\": 257,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 73,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 289,\n", + " \"y0\": 110,\n", + " \"x1\": 450,\n", + " \"y1\": 271,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + 
" \"x0\": 1430,\n", + " \"y0\": 82,\n", + " \"x1\": 1605,\n", + " \"y1\": 258,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 74,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 289,\n", + " \"y0\": 110,\n", + " \"x1\": 450,\n", + " \"y1\": 271,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1430,\n", + " \"y0\": 84,\n", + " \"x1\": 1604,\n", + " \"y1\": 258,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 75,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 289,\n", + " \"y0\": 110,\n", + " \"x1\": 450,\n", + " \"y1\": 271,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1429,\n", + " \"y0\": 84,\n", + " \"x1\": 1604,\n", + " \"y1\": 259,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 76,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 288,\n", + " \"y0\": 109,\n", + " \"x1\": 451,\n", + " \"y1\": 272,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1429,\n", + " \"y0\": 84,\n", + " \"x1\": 1604,\n", + " \"y1\": 259,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 77,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 288,\n", + " \"y0\": 109,\n", + " \"x1\": 450,\n", + " \"y1\": 272,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1429,\n", + " \"y0\": 85,\n", + " \"x1\": 1604,\n", + " \"y1\": 259,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 78,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 286,\n", + " \"y0\": 108,\n", + " \"x1\": 449,\n", + " \"y1\": 271,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1429,\n", + " 
\"y0\": 85,\n", + " \"x1\": 1604,\n", + " \"y1\": 259,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 79,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 282,\n", + " \"y0\": 102,\n", + " \"x1\": 445,\n", + " \"y1\": 265,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1429,\n", + " \"y0\": 85,\n", + " \"x1\": 1604,\n", + " \"y1\": 259,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 80,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 279,\n", + " \"y0\": 97,\n", + " \"x1\": 442,\n", + " \"y1\": 260,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1429,\n", + " \"y0\": 85,\n", + " \"x1\": 1604,\n", + " \"y1\": 260,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 81,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 274,\n", + " \"y0\": 94,\n", + " \"x1\": 437,\n", + " \"y1\": 257,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1429,\n", + " \"y0\": 85,\n", + " \"x1\": 1604,\n", + " \"y1\": 260,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 82,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 271,\n", + " \"y0\": 92,\n", + " \"x1\": 434,\n", + " \"y1\": 255,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1429,\n", + " \"y0\": 85,\n", + " \"x1\": 1604,\n", + " \"y1\": 260,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 83,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 270,\n", + " \"y0\": 90,\n", + " \"x1\": 433,\n", + " \"y1\": 253,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1429,\n", + " \"y0\": 85,\n", + " \"x1\": 
1603,\n", + " \"y1\": 260,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 84,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 270,\n", + " \"y0\": 90,\n", + " \"x1\": 433,\n", + " \"y1\": 253,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1428,\n", + " \"y0\": 86,\n", + " \"x1\": 1603,\n", + " \"y1\": 260,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 85,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 270,\n", + " \"y0\": 90,\n", + " \"x1\": 433,\n", + " \"y1\": 253,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1428,\n", + " \"y0\": 86,\n", + " \"x1\": 1603,\n", + " \"y1\": 261,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 86,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 270,\n", + " \"y0\": 90,\n", + " \"x1\": 433,\n", + " \"y1\": 253,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1428,\n", + " \"y0\": 86,\n", + " \"x1\": 1603,\n", + " \"y1\": 261,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 87,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 270,\n", + " \"y0\": 90,\n", + " \"x1\": 432,\n", + " \"y1\": 252,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1428,\n", + " \"y0\": 86,\n", + " \"x1\": 1603,\n", + " \"y1\": 261,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 88,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 270,\n", + " \"y0\": 90,\n", + " \"x1\": 432,\n", + " \"y1\": 252,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1427,\n", + " \"y0\": 86,\n", + " \"x1\": 1602,\n", + " \"y1\": 
261,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 89,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 270,\n", + " \"y0\": 90,\n", + " \"x1\": 432,\n", + " \"y1\": 252,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1427,\n", + " \"y0\": 86,\n", + " \"x1\": 1602,\n", + " \"y1\": 261,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 90,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 272,\n", + " \"y0\": 90,\n", + " \"x1\": 434,\n", + " \"y1\": 252,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1427,\n", + " \"y0\": 86,\n", + " \"x1\": 1602,\n", + " \"y1\": 261,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 91,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 274,\n", + " \"y0\": 91,\n", + " \"x1\": 436,\n", + " \"y1\": 253,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1427,\n", + " \"y0\": 86,\n", + " \"x1\": 1602,\n", + " \"y1\": 261,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 92,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 276,\n", + " \"y0\": 93,\n", + " \"x1\": 439,\n", + " \"y1\": 255,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1427,\n", + " \"y0\": 86,\n", + " \"x1\": 1602,\n", + " \"y1\": 261,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 93,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 279,\n", + " \"y0\": 94,\n", + " \"x1\": 441,\n", + " \"y1\": 256,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1427,\n", + " \"y0\": 86,\n", + " \"x1\": 1602,\n", + " \"y1\": 261,\n", + " \"speaking\": 
false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 94,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 281,\n", + " \"y0\": 95,\n", + " \"x1\": 443,\n", + " \"y1\": 257,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1426,\n", + " \"y0\": 86,\n", + " \"x1\": 1602,\n", + " \"y1\": 261,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 95,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 283,\n", + " \"y0\": 96,\n", + " \"x1\": 445,\n", + " \"y1\": 258,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1426,\n", + " \"y0\": 86,\n", + " \"x1\": 1601,\n", + " \"y1\": 262,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 96,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 285,\n", + " \"y0\": 96,\n", + " \"x1\": 447,\n", + " \"y1\": 259,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1426,\n", + " \"y0\": 86,\n", + " \"x1\": 1602,\n", + " \"y1\": 262,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 97,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 286,\n", + " \"y0\": 98,\n", + " \"x1\": 448,\n", + " \"y1\": 260,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1426,\n", + " \"y0\": 86,\n", + " \"x1\": 1602,\n", + " \"y1\": 262,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 98,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 286,\n", + " \"y0\": 99,\n", + " \"x1\": 448,\n", + " \"y1\": 261,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1426,\n", + " \"y0\": 86,\n", + " \"x1\": 1602,\n", + " \"y1\": 262,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", 
+ " },\n", + " {\n", + " \"frame_number\": 99,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 286,\n", + " \"y0\": 99,\n", + " \"x1\": 449,\n", + " \"y1\": 261,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1426,\n", + " \"y0\": 85,\n", + " \"x1\": 1602,\n", + " \"y1\": 261,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 100,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 286,\n", + " \"y0\": 99,\n", + " \"x1\": 449,\n", + " \"y1\": 261,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1426,\n", + " \"y0\": 84,\n", + " \"x1\": 1602,\n", + " \"y1\": 260,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 101,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 286,\n", + " \"y0\": 99,\n", + " \"x1\": 449,\n", + " \"y1\": 262,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1426,\n", + " \"y0\": 83,\n", + " \"x1\": 1602,\n", + " \"y1\": 259,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 102,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 286,\n", + " \"y0\": 99,\n", + " \"x1\": 449,\n", + " \"y1\": 262,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1426,\n", + " \"y0\": 81,\n", + " \"x1\": 1602,\n", + " \"y1\": 257,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 103,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 286,\n", + " \"y0\": 99,\n", + " \"x1\": 449,\n", + " \"y1\": 262,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1428,\n", + " \"y0\": 78,\n", + " \"x1\": 1604,\n", + " \"y1\": 254,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " 
\"frame_number\": 104,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 286,\n", + " \"y0\": 99,\n", + " \"x1\": 449,\n", + " \"y1\": 262,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1430,\n", + " \"y0\": 78,\n", + " \"x1\": 1605,\n", + " \"y1\": 253,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 105,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 285,\n", + " \"y0\": 99,\n", + " \"x1\": 448,\n", + " \"y1\": 262,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1430,\n", + " \"y0\": 77,\n", + " \"x1\": 1606,\n", + " \"y1\": 253,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 106,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 99,\n", + " \"x1\": 447,\n", + " \"y1\": 262,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1430,\n", + " \"y0\": 77,\n", + " \"x1\": 1606,\n", + " \"y1\": 253,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 107,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 283,\n", + " \"y0\": 99,\n", + " \"x1\": 446,\n", + " \"y1\": 262,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1430,\n", + " \"y0\": 77,\n", + " \"x1\": 1606,\n", + " \"y1\": 253,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 108,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 282,\n", + " \"y0\": 98,\n", + " \"x1\": 445,\n", + " \"y1\": 262,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1430,\n", + " \"y0\": 77,\n", + " \"x1\": 1606,\n", + " \"y1\": 253,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 109,\n", + 
" \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 281,\n", + " \"y0\": 98,\n", + " \"x1\": 444,\n", + " \"y1\": 261,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1430,\n", + " \"y0\": 77,\n", + " \"x1\": 1606,\n", + " \"y1\": 253,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 110,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 282,\n", + " \"y0\": 99,\n", + " \"x1\": 443,\n", + " \"y1\": 260,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1430,\n", + " \"y0\": 77,\n", + " \"x1\": 1606,\n", + " \"y1\": 253,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 111,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 282,\n", + " \"y0\": 98,\n", + " \"x1\": 443,\n", + " \"y1\": 260,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1430,\n", + " \"y0\": 77,\n", + " \"x1\": 1606,\n", + " \"y1\": 253,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 112,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 282,\n", + " \"y0\": 99,\n", + " \"x1\": 443,\n", + " \"y1\": 260,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1430,\n", + " \"y0\": 77,\n", + " \"x1\": 1606,\n", + " \"y1\": 253,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 113,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 282,\n", + " \"y0\": 99,\n", + " \"x1\": 443,\n", + " \"y1\": 260,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1430,\n", + " \"y0\": 81,\n", + " \"x1\": 1606,\n", + " \"y1\": 257,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 114,\n", + " \"faces\": [\n", + " {\n", 
+ " \"face_id\": 0,\n", + " \"x0\": 282,\n", + " \"y0\": 99,\n", + " \"x1\": 443,\n", + " \"y1\": 260,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1430,\n", + " \"y0\": 84,\n", + " \"x1\": 1606,\n", + " \"y1\": 259,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 115,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 282,\n", + " \"y0\": 99,\n", + " \"x1\": 443,\n", + " \"y1\": 260,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1429,\n", + " \"y0\": 86,\n", + " \"x1\": 1605,\n", + " \"y1\": 261,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 116,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 282,\n", + " \"y0\": 99,\n", + " \"x1\": 443,\n", + " \"y1\": 260,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1428,\n", + " \"y0\": 86,\n", + " \"x1\": 1604,\n", + " \"y1\": 262,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 117,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 282,\n", + " \"y0\": 99,\n", + " \"x1\": 443,\n", + " \"y1\": 260,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1427,\n", + " \"y0\": 86,\n", + " \"x1\": 1602,\n", + " \"y1\": 262,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 118,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 283,\n", + " \"y0\": 99,\n", + " \"x1\": 444,\n", + " \"y1\": 260,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1427,\n", + " \"y0\": 86,\n", + " \"x1\": 1602,\n", + " \"y1\": 262,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 119,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " 
\"x0\": 283,\n", + " \"y0\": 99,\n", + " \"x1\": 444,\n", + " \"y1\": 260,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1427,\n", + " \"y0\": 87,\n", + " \"x1\": 1602,\n", + " \"y1\": 262,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 120,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 99,\n", + " \"x1\": 444,\n", + " \"y1\": 260,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1427,\n", + " \"y0\": 87,\n", + " \"x1\": 1602,\n", + " \"y1\": 262,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 121,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 100,\n", + " \"x1\": 444,\n", + " \"y1\": 260,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1427,\n", + " \"y0\": 87,\n", + " \"x1\": 1602,\n", + " \"y1\": 262,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 122,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 100,\n", + " \"x1\": 444,\n", + " \"y1\": 260,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1427,\n", + " \"y0\": 87,\n", + " \"x1\": 1602,\n", + " \"y1\": 263,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 123,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 100,\n", + " \"x1\": 444,\n", + " \"y1\": 260,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1427,\n", + " \"y0\": 88,\n", + " \"x1\": 1602,\n", + " \"y1\": 263,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 124,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", 
+ " \"y0\": 100,\n", + " \"x1\": 444,\n", + " \"y1\": 260,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1428,\n", + " \"y0\": 88,\n", + " \"x1\": 1604,\n", + " \"y1\": 263,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 125,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 100,\n", + " \"x1\": 444,\n", + " \"y1\": 260,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1430,\n", + " \"y0\": 87,\n", + " \"x1\": 1606,\n", + " \"y1\": 264,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 126,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 100,\n", + " \"x1\": 444,\n", + " \"y1\": 260,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1431,\n", + " \"y0\": 87,\n", + " \"x1\": 1609,\n", + " \"y1\": 264,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 127,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 100,\n", + " \"x1\": 444,\n", + " \"y1\": 260,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1433,\n", + " \"y0\": 86,\n", + " \"x1\": 1612,\n", + " \"y1\": 265,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 128,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 100,\n", + " \"x1\": 444,\n", + " \"y1\": 260,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1435,\n", + " \"y0\": 86,\n", + " \"x1\": 1614,\n", + " \"y1\": 265,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 129,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 
100,\n", + " \"x1\": 444,\n", + " \"y1\": 260,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1436,\n", + " \"y0\": 85,\n", + " \"x1\": 1617,\n", + " \"y1\": 266,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 130,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 283,\n", + " \"y0\": 100,\n", + " \"x1\": 444,\n", + " \"y1\": 260,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1437,\n", + " \"y0\": 85,\n", + " \"x1\": 1619,\n", + " \"y1\": 266,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 131,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 283,\n", + " \"y0\": 100,\n", + " \"x1\": 444,\n", + " \"y1\": 260,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1439,\n", + " \"y0\": 84,\n", + " \"x1\": 1622,\n", + " \"y1\": 267,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 132,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 283,\n", + " \"y0\": 99,\n", + " \"x1\": 444,\n", + " \"y1\": 260,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1439,\n", + " \"y0\": 83,\n", + " \"x1\": 1623,\n", + " \"y1\": 267,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 133,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 283,\n", + " \"y0\": 99,\n", + " \"x1\": 444,\n", + " \"y1\": 260,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1440,\n", + " \"y0\": 83,\n", + " \"x1\": 1624,\n", + " \"y1\": 267,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 134,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 283,\n", + " \"y0\": 99,\n", + " 
\"x1\": 444,\n", + " \"y1\": 260,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1440,\n", + " \"y0\": 83,\n", + " \"x1\": 1624,\n", + " \"y1\": 267,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 135,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 283,\n", + " \"y0\": 99,\n", + " \"x1\": 444,\n", + " \"y1\": 260,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1440,\n", + " \"y0\": 83,\n", + " \"x1\": 1624,\n", + " \"y1\": 267,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 136,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 283,\n", + " \"y0\": 99,\n", + " \"x1\": 444,\n", + " \"y1\": 260,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1440,\n", + " \"y0\": 83,\n", + " \"x1\": 1624,\n", + " \"y1\": 267,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 137,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 283,\n", + " \"y0\": 99,\n", + " \"x1\": 444,\n", + " \"y1\": 260,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1440,\n", + " \"y0\": 82,\n", + " \"x1\": 1624,\n", + " \"y1\": 266,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 138,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 282,\n", + " \"y0\": 99,\n", + " \"x1\": 443,\n", + " \"y1\": 260,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1440,\n", + " \"y0\": 82,\n", + " \"x1\": 1624,\n", + " \"y1\": 266,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 139,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 282,\n", + " \"y0\": 99,\n", + " \"x1\": 443,\n", + " 
\"y1\": 260,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1441,\n", + " \"y0\": 83,\n", + " \"x1\": 1623,\n", + " \"y1\": 265,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 140,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 282,\n", + " \"y0\": 99,\n", + " \"x1\": 443,\n", + " \"y1\": 260,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1442,\n", + " \"y0\": 83,\n", + " \"x1\": 1623,\n", + " \"y1\": 264,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 141,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 282,\n", + " \"y0\": 99,\n", + " \"x1\": 443,\n", + " \"y1\": 260,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1442,\n", + " \"y0\": 84,\n", + " \"x1\": 1622,\n", + " \"y1\": 263,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 142,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 282,\n", + " \"y0\": 99,\n", + " \"x1\": 443,\n", + " \"y1\": 260,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1442,\n", + " \"y0\": 84,\n", + " \"x1\": 1622,\n", + " \"y1\": 263,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 143,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 282,\n", + " \"y0\": 99,\n", + " \"x1\": 443,\n", + " \"y1\": 260,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1442,\n", + " \"y0\": 84,\n", + " \"x1\": 1622,\n", + " \"y1\": 263,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 144,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 282,\n", + " \"y0\": 99,\n", + " \"x1\": 443,\n", + " \"y1\": 260,\n", + " 
\"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1442,\n", + " \"y0\": 84,\n", + " \"x1\": 1621,\n", + " \"y1\": 263,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 145,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 282,\n", + " \"y0\": 99,\n", + " \"x1\": 443,\n", + " \"y1\": 260,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1442,\n", + " \"y0\": 84,\n", + " \"x1\": 1621,\n", + " \"y1\": 263,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 146,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 283,\n", + " \"y0\": 99,\n", + " \"x1\": 444,\n", + " \"y1\": 260,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1442,\n", + " \"y0\": 84,\n", + " \"x1\": 1621,\n", + " \"y1\": 263,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 147,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 283,\n", + " \"y0\": 99,\n", + " \"x1\": 444,\n", + " \"y1\": 260,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1442,\n", + " \"y0\": 84,\n", + " \"x1\": 1621,\n", + " \"y1\": 263,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 148,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 283,\n", + " \"y0\": 99,\n", + " \"x1\": 444,\n", + " \"y1\": 260,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1442,\n", + " \"y0\": 84,\n", + " \"x1\": 1621,\n", + " \"y1\": 263,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 149,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 99,\n", + " \"x1\": 445,\n", + " \"y1\": 260,\n", + " \"speaking\": false\n", 
+ " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1442,\n", + " \"y0\": 84,\n", + " \"x1\": 1621,\n", + " \"y1\": 264,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 150,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 99,\n", + " \"x1\": 445,\n", + " \"y1\": 260,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1442,\n", + " \"y0\": 84,\n", + " \"x1\": 1621,\n", + " \"y1\": 264,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 151,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 99,\n", + " \"x1\": 445,\n", + " \"y1\": 260,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1442,\n", + " \"y0\": 84,\n", + " \"x1\": 1621,\n", + " \"y1\": 264,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 152,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 99,\n", + " \"x1\": 445,\n", + " \"y1\": 260,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1442,\n", + " \"y0\": 84,\n", + " \"x1\": 1621,\n", + " \"y1\": 264,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 153,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 99,\n", + " \"x1\": 445,\n", + " \"y1\": 260,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1442,\n", + " \"y0\": 84,\n", + " \"x1\": 1621,\n", + " \"y1\": 264,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 154,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 99,\n", + " \"x1\": 445,\n", + " \"y1\": 260,\n", + " \"speaking\": false\n", + " },\n", + " {\n", 
+ " \"face_id\": 1,\n", + " \"x0\": 1442,\n", + " \"y0\": 84,\n", + " \"x1\": 1621,\n", + " \"y1\": 264,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 155,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 99,\n", + " \"x1\": 445,\n", + " \"y1\": 260,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1442,\n", + " \"y0\": 84,\n", + " \"x1\": 1621,\n", + " \"y1\": 264,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 156,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 99,\n", + " \"x1\": 445,\n", + " \"y1\": 260,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1441,\n", + " \"y0\": 84,\n", + " \"x1\": 1622,\n", + " \"y1\": 264,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 157,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 99,\n", + " \"x1\": 445,\n", + " \"y1\": 260,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1441,\n", + " \"y0\": 83,\n", + " \"x1\": 1621,\n", + " \"y1\": 264,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 158,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 99,\n", + " \"x1\": 444,\n", + " \"y1\": 260,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1440,\n", + " \"y0\": 83,\n", + " \"x1\": 1622,\n", + " \"y1\": 264,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 159,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 283,\n", + " \"y0\": 99,\n", + " \"x1\": 444,\n", + " \"y1\": 260,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 
1,\n", + " \"x0\": 1437,\n", + " \"y0\": 82,\n", + " \"x1\": 1621,\n", + " \"y1\": 266,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 160,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 283,\n", + " \"y0\": 99,\n", + " \"x1\": 444,\n", + " \"y1\": 260,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1434,\n", + " \"y0\": 81,\n", + " \"x1\": 1620,\n", + " \"y1\": 267,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 161,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 283,\n", + " \"y0\": 99,\n", + " \"x1\": 444,\n", + " \"y1\": 260,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1434,\n", + " \"y0\": 81,\n", + " \"x1\": 1620,\n", + " \"y1\": 267,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 162,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 283,\n", + " \"y0\": 99,\n", + " \"x1\": 444,\n", + " \"y1\": 260,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1432,\n", + " \"y0\": 81,\n", + " \"x1\": 1618,\n", + " \"y1\": 267,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 163,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 283,\n", + " \"y0\": 99,\n", + " \"x1\": 444,\n", + " \"y1\": 260,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1430,\n", + " \"y0\": 81,\n", + " \"x1\": 1616,\n", + " \"y1\": 267,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 164,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 283,\n", + " \"y0\": 99,\n", + " \"x1\": 444,\n", + " \"y1\": 260,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 
1429,\n", + " \"y0\": 81,\n", + " \"x1\": 1615,\n", + " \"y1\": 267,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 165,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 283,\n", + " \"y0\": 99,\n", + " \"x1\": 444,\n", + " \"y1\": 260,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1429,\n", + " \"y0\": 82,\n", + " \"x1\": 1615,\n", + " \"y1\": 268,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 166,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 283,\n", + " \"y0\": 100,\n", + " \"x1\": 444,\n", + " \"y1\": 260,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1428,\n", + " \"y0\": 83,\n", + " \"x1\": 1614,\n", + " \"y1\": 269,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 167,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 283,\n", + " \"y0\": 100,\n", + " \"x1\": 444,\n", + " \"y1\": 260,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1426,\n", + " \"y0\": 83,\n", + " \"x1\": 1612,\n", + " \"y1\": 269,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 168,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 283,\n", + " \"y0\": 100,\n", + " \"x1\": 444,\n", + " \"y1\": 260,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1424,\n", + " \"y0\": 85,\n", + " \"x1\": 1610,\n", + " \"y1\": 270,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 169,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 283,\n", + " \"y0\": 100,\n", + " \"x1\": 444,\n", + " \"y1\": 260,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1421,\n", + " 
\"y0\": 86,\n", + " \"x1\": 1607,\n", + " \"y1\": 271,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 170,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 283,\n", + " \"y0\": 100,\n", + " \"x1\": 444,\n", + " \"y1\": 260,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1418,\n", + " \"y0\": 89,\n", + " \"x1\": 1604,\n", + " \"y1\": 275,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 171,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 283,\n", + " \"y0\": 100,\n", + " \"x1\": 444,\n", + " \"y1\": 260,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1414,\n", + " \"y0\": 96,\n", + " \"x1\": 1600,\n", + " \"y1\": 281,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 172,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 283,\n", + " \"y0\": 100,\n", + " \"x1\": 444,\n", + " \"y1\": 260,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1410,\n", + " \"y0\": 103,\n", + " \"x1\": 1595,\n", + " \"y1\": 288,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 173,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 283,\n", + " \"y0\": 99,\n", + " \"x1\": 444,\n", + " \"y1\": 260,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1410,\n", + " \"y0\": 109,\n", + " \"x1\": 1594,\n", + " \"y1\": 293,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 174,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 283,\n", + " \"y0\": 99,\n", + " \"x1\": 444,\n", + " \"y1\": 260,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1407,\n", + " \"y0\": 
113,\n", + " \"x1\": 1591,\n", + " \"y1\": 297,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 175,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 283,\n", + " \"y0\": 99,\n", + " \"x1\": 444,\n", + " \"y1\": 260,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1407,\n", + " \"y0\": 117,\n", + " \"x1\": 1590,\n", + " \"y1\": 300,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 176,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 283,\n", + " \"y0\": 99,\n", + " \"x1\": 444,\n", + " \"y1\": 260,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1405,\n", + " \"y0\": 120,\n", + " \"x1\": 1588,\n", + " \"y1\": 303,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 177,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 283,\n", + " \"y0\": 99,\n", + " \"x1\": 444,\n", + " \"y1\": 260,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1402,\n", + " \"y0\": 123,\n", + " \"x1\": 1585,\n", + " \"y1\": 306,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 178,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 283,\n", + " \"y0\": 99,\n", + " \"x1\": 444,\n", + " \"y1\": 260,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1402,\n", + " \"y0\": 126,\n", + " \"x1\": 1584,\n", + " \"y1\": 309,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 179,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 283,\n", + " \"y0\": 99,\n", + " \"x1\": 444,\n", + " \"y1\": 260,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1402,\n", + " \"y0\": 126,\n", + " 
\"x1\": 1584,\n", + " \"y1\": 308,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 180,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 283,\n", + " \"y0\": 99,\n", + " \"x1\": 444,\n", + " \"y1\": 260,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1402,\n", + " \"y0\": 126,\n", + " \"x1\": 1584,\n", + " \"y1\": 308,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 181,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 283,\n", + " \"y0\": 99,\n", + " \"x1\": 444,\n", + " \"y1\": 260,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1402,\n", + " \"y0\": 126,\n", + " \"x1\": 1584,\n", + " \"y1\": 309,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 182,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 283,\n", + " \"y0\": 99,\n", + " \"x1\": 444,\n", + " \"y1\": 260,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1402,\n", + " \"y0\": 126,\n", + " \"x1\": 1584,\n", + " \"y1\": 309,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 183,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 283,\n", + " \"y0\": 100,\n", + " \"x1\": 444,\n", + " \"y1\": 261,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1402,\n", + " \"y0\": 126,\n", + " \"x1\": 1584,\n", + " \"y1\": 309,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 184,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 283,\n", + " \"y0\": 100,\n", + " \"x1\": 444,\n", + " \"y1\": 261,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1402,\n", + " \"y0\": 126,\n", + " \"x1\": 
1584,\n", + " \"y1\": 309,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 185,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 283,\n", + " \"y0\": 100,\n", + " \"x1\": 444,\n", + " \"y1\": 261,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1402,\n", + " \"y0\": 126,\n", + " \"x1\": 1584,\n", + " \"y1\": 309,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 186,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 283,\n", + " \"y0\": 100,\n", + " \"x1\": 444,\n", + " \"y1\": 261,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1401,\n", + " \"y0\": 125,\n", + " \"x1\": 1586,\n", + " \"y1\": 310,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 187,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 283,\n", + " \"y0\": 100,\n", + " \"x1\": 444,\n", + " \"y1\": 261,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1401,\n", + " \"y0\": 124,\n", + " \"x1\": 1587,\n", + " \"y1\": 310,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 188,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 283,\n", + " \"y0\": 100,\n", + " \"x1\": 444,\n", + " \"y1\": 261,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1401,\n", + " \"y0\": 123,\n", + " \"x1\": 1588,\n", + " \"y1\": 310,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 189,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 283,\n", + " \"y0\": 99,\n", + " \"x1\": 446,\n", + " \"y1\": 262,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1402,\n", + " \"y0\": 120,\n", + " \"x1\": 1590,\n", 
+ " \"y1\": 308,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 190,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 282,\n", + " \"y0\": 98,\n", + " \"x1\": 446,\n", + " \"y1\": 263,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1402,\n", + " \"y0\": 119,\n", + " \"x1\": 1591,\n", + " \"y1\": 308,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 191,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 282,\n", + " \"y0\": 98,\n", + " \"x1\": 447,\n", + " \"y1\": 263,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1403,\n", + " \"y0\": 119,\n", + " \"x1\": 1593,\n", + " \"y1\": 309,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 192,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 282,\n", + " \"y0\": 98,\n", + " \"x1\": 447,\n", + " \"y1\": 263,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1404,\n", + " \"y0\": 118,\n", + " \"x1\": 1594,\n", + " \"y1\": 308,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 193,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 283,\n", + " \"y0\": 98,\n", + " \"x1\": 447,\n", + " \"y1\": 263,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1405,\n", + " \"y0\": 117,\n", + " \"x1\": 1596,\n", + " \"y1\": 308,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 194,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 283,\n", + " \"y0\": 97,\n", + " \"x1\": 447,\n", + " \"y1\": 261,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1406,\n", + " \"y0\": 116,\n", + " \"x1\": 1597,\n", + " \"y1\": 
307,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 195,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 283,\n", + " \"y0\": 97,\n", + " \"x1\": 447,\n", + " \"y1\": 261,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1407,\n", + " \"y0\": 115,\n", + " \"x1\": 1598,\n", + " \"y1\": 306,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 196,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 283,\n", + " \"y0\": 95,\n", + " \"x1\": 447,\n", + " \"y1\": 259,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1407,\n", + " \"y0\": 114,\n", + " \"x1\": 1598,\n", + " \"y1\": 305,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 197,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 283,\n", + " \"y0\": 95,\n", + " \"x1\": 447,\n", + " \"y1\": 259,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1408,\n", + " \"y0\": 113,\n", + " \"x1\": 1599,\n", + " \"y1\": 304,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 198,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 283,\n", + " \"y0\": 95,\n", + " \"x1\": 447,\n", + " \"y1\": 259,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1409,\n", + " \"y0\": 112,\n", + " \"x1\": 1600,\n", + " \"y1\": 303,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 199,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 282,\n", + " \"y0\": 94,\n", + " \"x1\": 447,\n", + " \"y1\": 259,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1410,\n", + " \"y0\": 111,\n", + " \"x1\": 1601,\n", + " \"y1\": 302,\n", + " 
\"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 200,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 282,\n", + " \"y0\": 94,\n", + " \"x1\": 447,\n", + " \"y1\": 259,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1412,\n", + " \"y0\": 111,\n", + " \"x1\": 1603,\n", + " \"y1\": 302,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 201,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 282,\n", + " \"y0\": 94,\n", + " \"x1\": 447,\n", + " \"y1\": 258,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1412,\n", + " \"y0\": 111,\n", + " \"x1\": 1603,\n", + " \"y1\": 301,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 202,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 282,\n", + " \"y0\": 94,\n", + " \"x1\": 446,\n", + " \"y1\": 258,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1412,\n", + " \"y0\": 111,\n", + " \"x1\": 1602,\n", + " \"y1\": 301,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 203,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 283,\n", + " \"y0\": 94,\n", + " \"x1\": 446,\n", + " \"y1\": 258,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1413,\n", + " \"y0\": 111,\n", + " \"x1\": 1602,\n", + " \"y1\": 299,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 204,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 282,\n", + " \"y0\": 94,\n", + " \"x1\": 446,\n", + " \"y1\": 258,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1414,\n", + " \"y0\": 110,\n", + " \"x1\": 1600,\n", + " \"y1\": 296,\n", + " \"speaking\": 
true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 205,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 282,\n", + " \"y0\": 94,\n", + " \"x1\": 446,\n", + " \"y1\": 258,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1415,\n", + " \"y0\": 110,\n", + " \"x1\": 1600,\n", + " \"y1\": 295,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 206,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 282,\n", + " \"y0\": 95,\n", + " \"x1\": 445,\n", + " \"y1\": 258,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1415,\n", + " \"y0\": 107,\n", + " \"x1\": 1600,\n", + " \"y1\": 292,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 207,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 282,\n", + " \"y0\": 95,\n", + " \"x1\": 445,\n", + " \"y1\": 258,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1415,\n", + " \"y0\": 105,\n", + " \"x1\": 1600,\n", + " \"y1\": 290,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 208,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 282,\n", + " \"y0\": 95,\n", + " \"x1\": 445,\n", + " \"y1\": 258,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1415,\n", + " \"y0\": 103,\n", + " \"x1\": 1600,\n", + " \"y1\": 288,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 209,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 282,\n", + " \"y0\": 95,\n", + " \"x1\": 445,\n", + " \"y1\": 258,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1415,\n", + " \"y0\": 103,\n", + " \"x1\": 1600,\n", + " \"y1\": 288,\n", + " \"speaking\": true\n", + " }\n", + " 
]\n", + " },\n", + " {\n", + " \"frame_number\": 210,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 282,\n", + " \"y0\": 95,\n", + " \"x1\": 445,\n", + " \"y1\": 258,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1415,\n", + " \"y0\": 103,\n", + " \"x1\": 1600,\n", + " \"y1\": 288,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 211,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 283,\n", + " \"y0\": 96,\n", + " \"x1\": 445,\n", + " \"y1\": 258,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1415,\n", + " \"y0\": 103,\n", + " \"x1\": 1600,\n", + " \"y1\": 288,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 212,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 97,\n", + " \"x1\": 446,\n", + " \"y1\": 260,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1415,\n", + " \"y0\": 103,\n", + " \"x1\": 1600,\n", + " \"y1\": 287,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 213,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 285,\n", + " \"y0\": 98,\n", + " \"x1\": 448,\n", + " \"y1\": 261,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1415,\n", + " \"y0\": 103,\n", + " \"x1\": 1600,\n", + " \"y1\": 287,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 214,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 287,\n", + " \"y0\": 99,\n", + " \"x1\": 449,\n", + " \"y1\": 261,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1415,\n", + " \"y0\": 102,\n", + " \"x1\": 1600,\n", + " \"y1\": 287,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + 
" {\n", + " \"frame_number\": 215,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 287,\n", + " \"y0\": 99,\n", + " \"x1\": 449,\n", + " \"y1\": 262,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1415,\n", + " \"y0\": 102,\n", + " \"x1\": 1600,\n", + " \"y1\": 287,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 216,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 287,\n", + " \"y0\": 99,\n", + " \"x1\": 449,\n", + " \"y1\": 262,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1415,\n", + " \"y0\": 101,\n", + " \"x1\": 1600,\n", + " \"y1\": 285,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 217,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 287,\n", + " \"y0\": 99,\n", + " \"x1\": 449,\n", + " \"y1\": 262,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1417,\n", + " \"y0\": 98,\n", + " \"x1\": 1598,\n", + " \"y1\": 279,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 218,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 287,\n", + " \"y0\": 99,\n", + " \"x1\": 450,\n", + " \"y1\": 262,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1419,\n", + " \"y0\": 95,\n", + " \"x1\": 1599,\n", + " \"y1\": 274,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 219,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 287,\n", + " \"y0\": 99,\n", + " \"x1\": 450,\n", + " \"y1\": 262,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1420,\n", + " \"y0\": 94,\n", + " \"x1\": 1599,\n", + " \"y1\": 272,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " 
\"frame_number\": 220,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 288,\n", + " \"y0\": 100,\n", + " \"x1\": 450,\n", + " \"y1\": 262,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1420,\n", + " \"y0\": 93,\n", + " \"x1\": 1599,\n", + " \"y1\": 272,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 221,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 288,\n", + " \"y0\": 100,\n", + " \"x1\": 450,\n", + " \"y1\": 262,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1421,\n", + " \"y0\": 90,\n", + " \"x1\": 1600,\n", + " \"y1\": 269,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 222,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 288,\n", + " \"y0\": 100,\n", + " \"x1\": 450,\n", + " \"y1\": 262,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1421,\n", + " \"y0\": 89,\n", + " \"x1\": 1600,\n", + " \"y1\": 268,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 223,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 288,\n", + " \"y0\": 102,\n", + " \"x1\": 450,\n", + " \"y1\": 264,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1421,\n", + " \"y0\": 89,\n", + " \"x1\": 1600,\n", + " \"y1\": 268,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 224,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 290,\n", + " \"y0\": 104,\n", + " \"x1\": 452,\n", + " \"y1\": 266,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1421,\n", + " \"y0\": 89,\n", + " \"x1\": 1600,\n", + " \"y1\": 268,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 
225,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 290,\n", + " \"y0\": 104,\n", + " \"x1\": 452,\n", + " \"y1\": 266,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1421,\n", + " \"y0\": 88,\n", + " \"x1\": 1600,\n", + " \"y1\": 268,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 226,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 290,\n", + " \"y0\": 104,\n", + " \"x1\": 451,\n", + " \"y1\": 266,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1420,\n", + " \"y0\": 88,\n", + " \"x1\": 1600,\n", + " \"y1\": 268,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 227,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 290,\n", + " \"y0\": 104,\n", + " \"x1\": 451,\n", + " \"y1\": 266,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1420,\n", + " \"y0\": 87,\n", + " \"x1\": 1599,\n", + " \"y1\": 267,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 228,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 290,\n", + " \"y0\": 104,\n", + " \"x1\": 451,\n", + " \"y1\": 266,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1419,\n", + " \"y0\": 87,\n", + " \"x1\": 1599,\n", + " \"y1\": 267,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 229,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 290,\n", + " \"y0\": 104,\n", + " \"x1\": 451,\n", + " \"y1\": 266,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1419,\n", + " \"y0\": 86,\n", + " \"x1\": 1600,\n", + " \"y1\": 266,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 230,\n", + " 
\"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 290,\n", + " \"y0\": 104,\n", + " \"x1\": 451,\n", + " \"y1\": 266,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1419,\n", + " \"y0\": 86,\n", + " \"x1\": 1600,\n", + " \"y1\": 266,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 231,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 287,\n", + " \"y0\": 103,\n", + " \"x1\": 449,\n", + " \"y1\": 264,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1418,\n", + " \"y0\": 86,\n", + " \"x1\": 1599,\n", + " \"y1\": 266,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 232,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 285,\n", + " \"y0\": 103,\n", + " \"x1\": 446,\n", + " \"y1\": 264,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1418,\n", + " \"y0\": 86,\n", + " \"x1\": 1598,\n", + " \"y1\": 266,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 233,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 285,\n", + " \"y0\": 103,\n", + " \"x1\": 446,\n", + " \"y1\": 264,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1416,\n", + " \"y0\": 86,\n", + " \"x1\": 1597,\n", + " \"y1\": 267,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 234,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 285,\n", + " \"y0\": 103,\n", + " \"x1\": 446,\n", + " \"y1\": 264,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1416,\n", + " \"y0\": 86,\n", + " \"x1\": 1597,\n", + " \"y1\": 267,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 235,\n", + " \"faces\": [\n", + 
" {\n", + " \"face_id\": 0,\n", + " \"x0\": 285,\n", + " \"y0\": 103,\n", + " \"x1\": 446,\n", + " \"y1\": 264,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1415,\n", + " \"y0\": 86,\n", + " \"x1\": 1596,\n", + " \"y1\": 267,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 236,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 285,\n", + " \"y0\": 103,\n", + " \"x1\": 446,\n", + " \"y1\": 264,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1413,\n", + " \"y0\": 86,\n", + " \"x1\": 1594,\n", + " \"y1\": 267,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 237,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 285,\n", + " \"y0\": 103,\n", + " \"x1\": 446,\n", + " \"y1\": 264,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1413,\n", + " \"y0\": 86,\n", + " \"x1\": 1594,\n", + " \"y1\": 267,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 238,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 285,\n", + " \"y0\": 103,\n", + " \"x1\": 446,\n", + " \"y1\": 264,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1413,\n", + " \"y0\": 91,\n", + " \"x1\": 1594,\n", + " \"y1\": 272,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 239,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 286,\n", + " \"y0\": 104,\n", + " \"x1\": 447,\n", + " \"y1\": 265,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1413,\n", + " \"y0\": 93,\n", + " \"x1\": 1594,\n", + " \"y1\": 274,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 240,\n", + " \"faces\": [\n", + " {\n", + " 
\"face_id\": 0,\n", + " \"x0\": 287,\n", + " \"y0\": 104,\n", + " \"x1\": 449,\n", + " \"y1\": 266,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1413,\n", + " \"y0\": 95,\n", + " \"x1\": 1594,\n", + " \"y1\": 276,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 241,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 287,\n", + " \"y0\": 104,\n", + " \"x1\": 449,\n", + " \"y1\": 266,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1413,\n", + " \"y0\": 97,\n", + " \"x1\": 1594,\n", + " \"y1\": 278,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 242,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 287,\n", + " \"y0\": 104,\n", + " \"x1\": 449,\n", + " \"y1\": 266,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1413,\n", + " \"y0\": 97,\n", + " \"x1\": 1594,\n", + " \"y1\": 278,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 243,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 287,\n", + " \"y0\": 104,\n", + " \"x1\": 449,\n", + " \"y1\": 266,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1414,\n", + " \"y0\": 97,\n", + " \"x1\": 1595,\n", + " \"y1\": 278,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 244,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 287,\n", + " \"y0\": 104,\n", + " \"x1\": 449,\n", + " \"y1\": 266,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1417,\n", + " \"y0\": 97,\n", + " \"x1\": 1597,\n", + " \"y1\": 278,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 245,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", 
+ " \"x0\": 287,\n", + " \"y0\": 104,\n", + " \"x1\": 449,\n", + " \"y1\": 266,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1419,\n", + " \"y0\": 97,\n", + " \"x1\": 1599,\n", + " \"y1\": 278,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 246,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 287,\n", + " \"y0\": 104,\n", + " \"x1\": 449,\n", + " \"y1\": 266,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1423,\n", + " \"y0\": 98,\n", + " \"x1\": 1602,\n", + " \"y1\": 277,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 247,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 287,\n", + " \"y0\": 104,\n", + " \"x1\": 448,\n", + " \"y1\": 266,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1427,\n", + " \"y0\": 98,\n", + " \"x1\": 1606,\n", + " \"y1\": 277,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 248,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 287,\n", + " \"y0\": 104,\n", + " \"x1\": 448,\n", + " \"y1\": 266,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1432,\n", + " \"y0\": 97,\n", + " \"x1\": 1610,\n", + " \"y1\": 275,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 249,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 287,\n", + " \"y0\": 104,\n", + " \"x1\": 448,\n", + " \"y1\": 266,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1436,\n", + " \"y0\": 96,\n", + " \"x1\": 1614,\n", + " \"y1\": 273,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 250,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 
287,\n", + " \"y0\": 104,\n", + " \"x1\": 448,\n", + " \"y1\": 265,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1437,\n", + " \"y0\": 94,\n", + " \"x1\": 1615,\n", + " \"y1\": 272,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 251,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 287,\n", + " \"y0\": 104,\n", + " \"x1\": 448,\n", + " \"y1\": 265,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1438,\n", + " \"y0\": 94,\n", + " \"x1\": 1615,\n", + " \"y1\": 272,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 252,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 287,\n", + " \"y0\": 105,\n", + " \"x1\": 448,\n", + " \"y1\": 265,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1438,\n", + " \"y0\": 94,\n", + " \"x1\": 1616,\n", + " \"y1\": 271,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 253,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 287,\n", + " \"y0\": 105,\n", + " \"x1\": 448,\n", + " \"y1\": 265,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1439,\n", + " \"y0\": 94,\n", + " \"x1\": 1616,\n", + " \"y1\": 270,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 254,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 287,\n", + " \"y0\": 104,\n", + " \"x1\": 448,\n", + " \"y1\": 266,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1439,\n", + " \"y0\": 93,\n", + " \"x1\": 1616,\n", + " \"y1\": 269,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 255,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 287,\n", + " 
\"y0\": 104,\n", + " \"x1\": 448,\n", + " \"y1\": 265,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1439,\n", + " \"y0\": 93,\n", + " \"x1\": 1616,\n", + " \"y1\": 269,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 256,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 287,\n", + " \"y0\": 105,\n", + " \"x1\": 448,\n", + " \"y1\": 266,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1440,\n", + " \"y0\": 93,\n", + " \"x1\": 1616,\n", + " \"y1\": 269,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 257,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 287,\n", + " \"y0\": 105,\n", + " \"x1\": 448,\n", + " \"y1\": 266,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1439,\n", + " \"y0\": 93,\n", + " \"x1\": 1615,\n", + " \"y1\": 269,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 258,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 287,\n", + " \"y0\": 105,\n", + " \"x1\": 448,\n", + " \"y1\": 266,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1439,\n", + " \"y0\": 93,\n", + " \"x1\": 1615,\n", + " \"y1\": 269,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 259,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 287,\n", + " \"y0\": 105,\n", + " \"x1\": 448,\n", + " \"y1\": 266,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1439,\n", + " \"y0\": 93,\n", + " \"x1\": 1615,\n", + " \"y1\": 269,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 260,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 287,\n", + " \"y0\": 105,\n", + 
" \"x1\": 448,\n", + " \"y1\": 266,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1438,\n", + " \"y0\": 93,\n", + " \"x1\": 1614,\n", + " \"y1\": 269,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 261,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 287,\n", + " \"y0\": 105,\n", + " \"x1\": 448,\n", + " \"y1\": 266,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1436,\n", + " \"y0\": 96,\n", + " \"x1\": 1612,\n", + " \"y1\": 273,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 262,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 287,\n", + " \"y0\": 105,\n", + " \"x1\": 448,\n", + " \"y1\": 266,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1435,\n", + " \"y0\": 102,\n", + " \"x1\": 1610,\n", + " \"y1\": 277,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 263,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 287,\n", + " \"y0\": 105,\n", + " \"x1\": 448,\n", + " \"y1\": 266,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1435,\n", + " \"y0\": 104,\n", + " \"x1\": 1610,\n", + " \"y1\": 280,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 264,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 287,\n", + " \"y0\": 105,\n", + " \"x1\": 448,\n", + " \"y1\": 266,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1435,\n", + " \"y0\": 104,\n", + " \"x1\": 1610,\n", + " \"y1\": 280,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 265,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 287,\n", + " \"y0\": 105,\n", + " \"x1\": 
448,\n", + " \"y1\": 266,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1435,\n", + " \"y0\": 104,\n", + " \"x1\": 1610,\n", + " \"y1\": 280,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 266,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 287,\n", + " \"y0\": 105,\n", + " \"x1\": 448,\n", + " \"y1\": 266,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1434,\n", + " \"y0\": 104,\n", + " \"x1\": 1611,\n", + " \"y1\": 281,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 267,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 287,\n", + " \"y0\": 105,\n", + " \"x1\": 448,\n", + " \"y1\": 266,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1434,\n", + " \"y0\": 104,\n", + " \"x1\": 1611,\n", + " \"y1\": 281,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 268,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 287,\n", + " \"y0\": 105,\n", + " \"x1\": 448,\n", + " \"y1\": 266,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1434,\n", + " \"y0\": 104,\n", + " \"x1\": 1611,\n", + " \"y1\": 281,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 269,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 287,\n", + " \"y0\": 105,\n", + " \"x1\": 448,\n", + " \"y1\": 266,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1437,\n", + " \"y0\": 104,\n", + " \"x1\": 1613,\n", + " \"y1\": 281,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 270,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 287,\n", + " \"y0\": 105,\n", + " \"x1\": 448,\n", + " 
\"y1\": 266,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1438,\n", + " \"y0\": 105,\n", + " \"x1\": 1614,\n", + " \"y1\": 280,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 271,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 287,\n", + " \"y0\": 106,\n", + " \"x1\": 448,\n", + " \"y1\": 266,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1439,\n", + " \"y0\": 105,\n", + " \"x1\": 1614,\n", + " \"y1\": 280,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 272,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 287,\n", + " \"y0\": 106,\n", + " \"x1\": 448,\n", + " \"y1\": 266,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1439,\n", + " \"y0\": 104,\n", + " \"x1\": 1614,\n", + " \"y1\": 280,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 273,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 287,\n", + " \"y0\": 106,\n", + " \"x1\": 448,\n", + " \"y1\": 266,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1439,\n", + " \"y0\": 104,\n", + " \"x1\": 1614,\n", + " \"y1\": 280,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 274,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 287,\n", + " \"y0\": 106,\n", + " \"x1\": 447,\n", + " \"y1\": 266,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1439,\n", + " \"y0\": 104,\n", + " \"x1\": 1614,\n", + " \"y1\": 280,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 275,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 287,\n", + " \"y0\": 106,\n", + " \"x1\": 447,\n", + " \"y1\": 
266,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1439,\n", + " \"y0\": 105,\n", + " \"x1\": 1614,\n", + " \"y1\": 280,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 276,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 287,\n", + " \"y0\": 106,\n", + " \"x1\": 447,\n", + " \"y1\": 266,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1439,\n", + " \"y0\": 105,\n", + " \"x1\": 1614,\n", + " \"y1\": 280,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 277,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 287,\n", + " \"y0\": 106,\n", + " \"x1\": 447,\n", + " \"y1\": 266,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1438,\n", + " \"y0\": 104,\n", + " \"x1\": 1615,\n", + " \"y1\": 281,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 278,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 287,\n", + " \"y0\": 106,\n", + " \"x1\": 447,\n", + " \"y1\": 266,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1438,\n", + " \"y0\": 104,\n", + " \"x1\": 1615,\n", + " \"y1\": 281,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 279,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 287,\n", + " \"y0\": 106,\n", + " \"x1\": 447,\n", + " \"y1\": 266,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1438,\n", + " \"y0\": 104,\n", + " \"x1\": 1615,\n", + " \"y1\": 281,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 280,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 287,\n", + " \"y0\": 106,\n", + " \"x1\": 447,\n", + " \"y1\": 266,\n", + " 
\"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1438,\n", + " \"y0\": 105,\n", + " \"x1\": 1615,\n", + " \"y1\": 282,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 281,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 287,\n", + " \"y0\": 106,\n", + " \"x1\": 447,\n", + " \"y1\": 267,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1438,\n", + " \"y0\": 105,\n", + " \"x1\": 1615,\n", + " \"y1\": 283,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 282,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 287,\n", + " \"y0\": 106,\n", + " \"x1\": 447,\n", + " \"y1\": 267,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1438,\n", + " \"y0\": 105,\n", + " \"x1\": 1615,\n", + " \"y1\": 283,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 283,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 287,\n", + " \"y0\": 106,\n", + " \"x1\": 447,\n", + " \"y1\": 267,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1438,\n", + " \"y0\": 105,\n", + " \"x1\": 1615,\n", + " \"y1\": 283,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 284,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 287,\n", + " \"y0\": 106,\n", + " \"x1\": 447,\n", + " \"y1\": 267,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1437,\n", + " \"y0\": 105,\n", + " \"x1\": 1615,\n", + " \"y1\": 283,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 285,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 287,\n", + " \"y0\": 106,\n", + " \"x1\": 447,\n", + " \"y1\": 267,\n", + " \"speaking\": 
false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1437,\n", + " \"y0\": 105,\n", + " \"x1\": 1615,\n", + " \"y1\": 283,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 286,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 287,\n", + " \"y0\": 106,\n", + " \"x1\": 447,\n", + " \"y1\": 267,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1437,\n", + " \"y0\": 105,\n", + " \"x1\": 1615,\n", + " \"y1\": 284,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 287,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 287,\n", + " \"y0\": 106,\n", + " \"x1\": 448,\n", + " \"y1\": 267,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1436,\n", + " \"y0\": 105,\n", + " \"x1\": 1615,\n", + " \"y1\": 284,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 288,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 287,\n", + " \"y0\": 106,\n", + " \"x1\": 448,\n", + " \"y1\": 267,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1436,\n", + " \"y0\": 106,\n", + " \"x1\": 1615,\n", + " \"y1\": 285,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 289,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 287,\n", + " \"y0\": 106,\n", + " \"x1\": 448,\n", + " \"y1\": 267,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1436,\n", + " \"y0\": 105,\n", + " \"x1\": 1615,\n", + " \"y1\": 285,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 290,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 287,\n", + " \"y0\": 106,\n", + " \"x1\": 448,\n", + " \"y1\": 267,\n", + " \"speaking\": false\n", + " 
},\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1435,\n", + " \"y0\": 106,\n", + " \"x1\": 1615,\n", + " \"y1\": 285,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 291,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 287,\n", + " \"y0\": 106,\n", + " \"x1\": 448,\n", + " \"y1\": 267,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1435,\n", + " \"y0\": 106,\n", + " \"x1\": 1615,\n", + " \"y1\": 285,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 292,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 287,\n", + " \"y0\": 106,\n", + " \"x1\": 448,\n", + " \"y1\": 268,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1435,\n", + " \"y0\": 106,\n", + " \"x1\": 1615,\n", + " \"y1\": 285,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 293,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 287,\n", + " \"y0\": 106,\n", + " \"x1\": 448,\n", + " \"y1\": 268,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1435,\n", + " \"y0\": 106,\n", + " \"x1\": 1615,\n", + " \"y1\": 285,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 294,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 287,\n", + " \"y0\": 106,\n", + " \"x1\": 448,\n", + " \"y1\": 268,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1435,\n", + " \"y0\": 106,\n", + " \"x1\": 1615,\n", + " \"y1\": 285,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 295,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 287,\n", + " \"y0\": 106,\n", + " \"x1\": 448,\n", + " \"y1\": 268,\n", + " \"speaking\": false\n", + " },\n", + " 
{\n", + " \"face_id\": 1,\n", + " \"x0\": 1435,\n", + " \"y0\": 106,\n", + " \"x1\": 1615,\n", + " \"y1\": 285,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 296,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 287,\n", + " \"y0\": 106,\n", + " \"x1\": 448,\n", + " \"y1\": 268,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1435,\n", + " \"y0\": 106,\n", + " \"x1\": 1614,\n", + " \"y1\": 285,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 297,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 286,\n", + " \"y0\": 106,\n", + " \"x1\": 448,\n", + " \"y1\": 268,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1435,\n", + " \"y0\": 106,\n", + " \"x1\": 1614,\n", + " \"y1\": 285,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 298,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 286,\n", + " \"y0\": 106,\n", + " \"x1\": 448,\n", + " \"y1\": 268,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1434,\n", + " \"y0\": 106,\n", + " \"x1\": 1614,\n", + " \"y1\": 285,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 299,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 286,\n", + " \"y0\": 106,\n", + " \"x1\": 449,\n", + " \"y1\": 269,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1434,\n", + " \"y0\": 106,\n", + " \"x1\": 1613,\n", + " \"y1\": 285,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 300,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 286,\n", + " \"y0\": 106,\n", + " \"x1\": 449,\n", + " \"y1\": 270,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " 
\"face_id\": 1,\n", + " \"x0\": 1434,\n", + " \"y0\": 107,\n", + " \"x1\": 1612,\n", + " \"y1\": 284,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 301,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 285,\n", + " \"y0\": 106,\n", + " \"x1\": 449,\n", + " \"y1\": 270,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1435,\n", + " \"y0\": 108,\n", + " \"x1\": 1611,\n", + " \"y1\": 283,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 302,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 285,\n", + " \"y0\": 106,\n", + " \"x1\": 450,\n", + " \"y1\": 271,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1436,\n", + " \"y0\": 108,\n", + " \"x1\": 1610,\n", + " \"y1\": 283,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 303,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 285,\n", + " \"y0\": 106,\n", + " \"x1\": 450,\n", + " \"y1\": 271,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1435,\n", + " \"y0\": 108,\n", + " \"x1\": 1610,\n", + " \"y1\": 283,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 304,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 106,\n", + " \"x1\": 450,\n", + " \"y1\": 272,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1435,\n", + " \"y0\": 108,\n", + " \"x1\": 1610,\n", + " \"y1\": 283,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 305,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 106,\n", + " \"x1\": 450,\n", + " \"y1\": 272,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 
1,\n", + " \"x0\": 1435,\n", + " \"y0\": 108,\n", + " \"x1\": 1610,\n", + " \"y1\": 283,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 306,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 106,\n", + " \"x1\": 450,\n", + " \"y1\": 272,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1435,\n", + " \"y0\": 108,\n", + " \"x1\": 1610,\n", + " \"y1\": 283,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 307,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 106,\n", + " \"x1\": 450,\n", + " \"y1\": 272,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1435,\n", + " \"y0\": 108,\n", + " \"x1\": 1610,\n", + " \"y1\": 283,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 308,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 106,\n", + " \"x1\": 450,\n", + " \"y1\": 272,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1435,\n", + " \"y0\": 108,\n", + " \"x1\": 1610,\n", + " \"y1\": 283,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 309,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 106,\n", + " \"x1\": 450,\n", + " \"y1\": 271,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1435,\n", + " \"y0\": 108,\n", + " \"x1\": 1610,\n", + " \"y1\": 283,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 310,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 106,\n", + " \"x1\": 450,\n", + " \"y1\": 271,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " 
\"x0\": 1435,\n", + " \"y0\": 108,\n", + " \"x1\": 1609,\n", + " \"y1\": 283,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 311,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 106,\n", + " \"x1\": 450,\n", + " \"y1\": 271,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1434,\n", + " \"y0\": 108,\n", + " \"x1\": 1609,\n", + " \"y1\": 283,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 312,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 106,\n", + " \"x1\": 450,\n", + " \"y1\": 271,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1434,\n", + " \"y0\": 109,\n", + " \"x1\": 1608,\n", + " \"y1\": 283,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 313,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 106,\n", + " \"x1\": 450,\n", + " \"y1\": 271,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1434,\n", + " \"y0\": 109,\n", + " \"x1\": 1608,\n", + " \"y1\": 283,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 314,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 106,\n", + " \"x1\": 450,\n", + " \"y1\": 271,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1433,\n", + " \"y0\": 109,\n", + " \"x1\": 1607,\n", + " \"y1\": 283,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 315,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 106,\n", + " \"x1\": 450,\n", + " \"y1\": 271,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 
1433,\n", + " \"y0\": 109,\n", + " \"x1\": 1607,\n", + " \"y1\": 284,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 316,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 106,\n", + " \"x1\": 450,\n", + " \"y1\": 271,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1432,\n", + " \"y0\": 110,\n", + " \"x1\": 1606,\n", + " \"y1\": 284,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 317,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 106,\n", + " \"x1\": 450,\n", + " \"y1\": 271,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1432,\n", + " \"y0\": 111,\n", + " \"x1\": 1606,\n", + " \"y1\": 285,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 318,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 106,\n", + " \"x1\": 450,\n", + " \"y1\": 271,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1432,\n", + " \"y0\": 112,\n", + " \"x1\": 1605,\n", + " \"y1\": 285,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 319,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 106,\n", + " \"x1\": 449,\n", + " \"y1\": 271,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1432,\n", + " \"y0\": 112,\n", + " \"x1\": 1605,\n", + " \"y1\": 285,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 320,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 106,\n", + " \"x1\": 449,\n", + " \"y1\": 271,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1431,\n", + " 
\"y0\": 112,\n", + " \"x1\": 1605,\n", + " \"y1\": 285,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 321,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 106,\n", + " \"x1\": 449,\n", + " \"y1\": 271,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1431,\n", + " \"y0\": 112,\n", + " \"x1\": 1604,\n", + " \"y1\": 285,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 322,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 106,\n", + " \"x1\": 449,\n", + " \"y1\": 271,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1430,\n", + " \"y0\": 112,\n", + " \"x1\": 1604,\n", + " \"y1\": 285,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 323,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 106,\n", + " \"x1\": 449,\n", + " \"y1\": 271,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1430,\n", + " \"y0\": 112,\n", + " \"x1\": 1603,\n", + " \"y1\": 285,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 324,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 106,\n", + " \"x1\": 449,\n", + " \"y1\": 271,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1430,\n", + " \"y0\": 112,\n", + " \"x1\": 1603,\n", + " \"y1\": 286,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 325,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 106,\n", + " \"x1\": 449,\n", + " \"y1\": 271,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1430,\n", + " \"y0\": 
112,\n", + " \"x1\": 1603,\n", + " \"y1\": 286,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 326,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 106,\n", + " \"x1\": 449,\n", + " \"y1\": 271,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1429,\n", + " \"y0\": 112,\n", + " \"x1\": 1604,\n", + " \"y1\": 286,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 327,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 106,\n", + " \"x1\": 449,\n", + " \"y1\": 271,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1429,\n", + " \"y0\": 112,\n", + " \"x1\": 1604,\n", + " \"y1\": 286,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 328,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 106,\n", + " \"x1\": 449,\n", + " \"y1\": 271,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1429,\n", + " \"y0\": 112,\n", + " \"x1\": 1604,\n", + " \"y1\": 286,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 329,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 106,\n", + " \"x1\": 449,\n", + " \"y1\": 271,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1429,\n", + " \"y0\": 112,\n", + " \"x1\": 1604,\n", + " \"y1\": 286,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 330,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 106,\n", + " \"x1\": 449,\n", + " \"y1\": 271,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1428,\n", + " \"y0\": 111,\n", + " 
\"x1\": 1605,\n", + " \"y1\": 287,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 331,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 106,\n", + " \"x1\": 449,\n", + " \"y1\": 271,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1428,\n", + " \"y0\": 109,\n", + " \"x1\": 1605,\n", + " \"y1\": 286,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 332,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 106,\n", + " \"x1\": 449,\n", + " \"y1\": 271,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1428,\n", + " \"y0\": 109,\n", + " \"x1\": 1605,\n", + " \"y1\": 286,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 333,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 106,\n", + " \"x1\": 449,\n", + " \"y1\": 271,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1428,\n", + " \"y0\": 107,\n", + " \"x1\": 1605,\n", + " \"y1\": 285,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 334,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 106,\n", + " \"x1\": 449,\n", + " \"y1\": 271,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1428,\n", + " \"y0\": 107,\n", + " \"x1\": 1605,\n", + " \"y1\": 284,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 335,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 106,\n", + " \"x1\": 449,\n", + " \"y1\": 270,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1428,\n", + " \"y0\": 106,\n", + " \"x1\": 
1605,\n", + " \"y1\": 283,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 336,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 106,\n", + " \"x1\": 449,\n", + " \"y1\": 270,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1428,\n", + " \"y0\": 105,\n", + " \"x1\": 1605,\n", + " \"y1\": 282,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 337,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 285,\n", + " \"y0\": 106,\n", + " \"x1\": 449,\n", + " \"y1\": 270,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1428,\n", + " \"y0\": 104,\n", + " \"x1\": 1605,\n", + " \"y1\": 282,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 338,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 285,\n", + " \"y0\": 106,\n", + " \"x1\": 449,\n", + " \"y1\": 270,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1428,\n", + " \"y0\": 104,\n", + " \"x1\": 1605,\n", + " \"y1\": 281,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 339,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 285,\n", + " \"y0\": 106,\n", + " \"x1\": 449,\n", + " \"y1\": 270,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1428,\n", + " \"y0\": 104,\n", + " \"x1\": 1604,\n", + " \"y1\": 280,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 340,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 285,\n", + " \"y0\": 106,\n", + " \"x1\": 449,\n", + " \"y1\": 270,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1428,\n", + " \"y0\": 104,\n", + " \"x1\": 1604,\n", + " 
\"y1\": 280,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 341,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 106,\n", + " \"x1\": 448,\n", + " \"y1\": 270,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1428,\n", + " \"y0\": 103,\n", + " \"x1\": 1604,\n", + " \"y1\": 279,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 342,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 106,\n", + " \"x1\": 448,\n", + " \"y1\": 270,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1428,\n", + " \"y0\": 101,\n", + " \"x1\": 1604,\n", + " \"y1\": 277,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 343,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 285,\n", + " \"y0\": 106,\n", + " \"x1\": 448,\n", + " \"y1\": 270,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1429,\n", + " \"y0\": 98,\n", + " \"x1\": 1605,\n", + " \"y1\": 274,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 344,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 285,\n", + " \"y0\": 106,\n", + " \"x1\": 448,\n", + " \"y1\": 269,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1429,\n", + " \"y0\": 96,\n", + " \"x1\": 1605,\n", + " \"y1\": 272,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 345,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 285,\n", + " \"y0\": 106,\n", + " \"x1\": 448,\n", + " \"y1\": 269,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1429,\n", + " \"y0\": 92,\n", + " \"x1\": 1605,\n", + " \"y1\": 269,\n", 
+ " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 346,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 285,\n", + " \"y0\": 106,\n", + " \"x1\": 448,\n", + " \"y1\": 269,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1427,\n", + " \"y0\": 89,\n", + " \"x1\": 1607,\n", + " \"y1\": 269,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 347,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 106,\n", + " \"x1\": 448,\n", + " \"y1\": 269,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1426,\n", + " \"y0\": 86,\n", + " \"x1\": 1609,\n", + " \"y1\": 268,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 348,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 106,\n", + " \"x1\": 448,\n", + " \"y1\": 269,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1430,\n", + " \"y0\": 84,\n", + " \"x1\": 1612,\n", + " \"y1\": 267,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 349,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 106,\n", + " \"x1\": 448,\n", + " \"y1\": 269,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1432,\n", + " \"y0\": 83,\n", + " \"x1\": 1616,\n", + " \"y1\": 267,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 350,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 106,\n", + " \"x1\": 448,\n", + " \"y1\": 269,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1435,\n", + " \"y0\": 83,\n", + " \"x1\": 1618,\n", + " \"y1\": 267,\n", + " \"speaking\": 
true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 351,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 106,\n", + " \"x1\": 448,\n", + " \"y1\": 269,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1437,\n", + " \"y0\": 83,\n", + " \"x1\": 1620,\n", + " \"y1\": 266,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 352,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 106,\n", + " \"x1\": 448,\n", + " \"y1\": 269,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1437,\n", + " \"y0\": 83,\n", + " \"x1\": 1620,\n", + " \"y1\": 266,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 353,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 106,\n", + " \"x1\": 448,\n", + " \"y1\": 269,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1437,\n", + " \"y0\": 83,\n", + " \"x1\": 1620,\n", + " \"y1\": 266,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 354,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 106,\n", + " \"x1\": 448,\n", + " \"y1\": 269,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1437,\n", + " \"y0\": 83,\n", + " \"x1\": 1620,\n", + " \"y1\": 266,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 355,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 106,\n", + " \"x1\": 448,\n", + " \"y1\": 269,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1437,\n", + " \"y0\": 83,\n", + " \"x1\": 1620,\n", + " \"y1\": 266,\n", + " \"speaking\": true\n", + " }\n", 
+ " ]\n", + " },\n", + " {\n", + " \"frame_number\": 356,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 106,\n", + " \"x1\": 448,\n", + " \"y1\": 269,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1437,\n", + " \"y0\": 83,\n", + " \"x1\": 1620,\n", + " \"y1\": 267,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 357,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 106,\n", + " \"x1\": 448,\n", + " \"y1\": 269,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1437,\n", + " \"y0\": 83,\n", + " \"x1\": 1620,\n", + " \"y1\": 267,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 358,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 106,\n", + " \"x1\": 448,\n", + " \"y1\": 269,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1436,\n", + " \"y0\": 83,\n", + " \"x1\": 1619,\n", + " \"y1\": 267,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 359,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 106,\n", + " \"x1\": 448,\n", + " \"y1\": 269,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1435,\n", + " \"y0\": 84,\n", + " \"x1\": 1618,\n", + " \"y1\": 267,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 360,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 106,\n", + " \"x1\": 448,\n", + " \"y1\": 269,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1434,\n", + " \"y0\": 85,\n", + " \"x1\": 1617,\n", + " \"y1\": 268,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " 
},\n", + " {\n", + " \"frame_number\": 361,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 106,\n", + " \"x1\": 448,\n", + " \"y1\": 269,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1434,\n", + " \"y0\": 85,\n", + " \"x1\": 1616,\n", + " \"y1\": 267,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 362,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 106,\n", + " \"x1\": 448,\n", + " \"y1\": 269,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1434,\n", + " \"y0\": 85,\n", + " \"x1\": 1616,\n", + " \"y1\": 267,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 363,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 106,\n", + " \"x1\": 447,\n", + " \"y1\": 269,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1434,\n", + " \"y0\": 86,\n", + " \"x1\": 1615,\n", + " \"y1\": 267,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 364,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 106,\n", + " \"x1\": 447,\n", + " \"y1\": 269,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1434,\n", + " \"y0\": 86,\n", + " \"x1\": 1615,\n", + " \"y1\": 267,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 365,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 106,\n", + " \"x1\": 447,\n", + " \"y1\": 269,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1434,\n", + " \"y0\": 86,\n", + " \"x1\": 1615,\n", + " \"y1\": 267,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + 
" \"frame_number\": 366,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 106,\n", + " \"x1\": 447,\n", + " \"y1\": 269,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1434,\n", + " \"y0\": 86,\n", + " \"x1\": 1615,\n", + " \"y1\": 267,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 367,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 106,\n", + " \"x1\": 448,\n", + " \"y1\": 269,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1434,\n", + " \"y0\": 86,\n", + " \"x1\": 1615,\n", + " \"y1\": 267,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 368,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 106,\n", + " \"x1\": 448,\n", + " \"y1\": 269,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1434,\n", + " \"y0\": 86,\n", + " \"x1\": 1615,\n", + " \"y1\": 267,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 369,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 106,\n", + " \"x1\": 448,\n", + " \"y1\": 269,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1434,\n", + " \"y0\": 86,\n", + " \"x1\": 1615,\n", + " \"y1\": 267,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 370,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 106,\n", + " \"x1\": 447,\n", + " \"y1\": 269,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1434,\n", + " \"y0\": 86,\n", + " \"x1\": 1615,\n", + " \"y1\": 268,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 
371,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 106,\n", + " \"x1\": 448,\n", + " \"y1\": 269,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1434,\n", + " \"y0\": 86,\n", + " \"x1\": 1615,\n", + " \"y1\": 268,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 372,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 106,\n", + " \"x1\": 448,\n", + " \"y1\": 269,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1433,\n", + " \"y0\": 86,\n", + " \"x1\": 1615,\n", + " \"y1\": 268,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 373,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 106,\n", + " \"x1\": 447,\n", + " \"y1\": 269,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1433,\n", + " \"y0\": 86,\n", + " \"x1\": 1615,\n", + " \"y1\": 268,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 374,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 106,\n", + " \"x1\": 447,\n", + " \"y1\": 269,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1433,\n", + " \"y0\": 85,\n", + " \"x1\": 1615,\n", + " \"y1\": 267,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 375,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 106,\n", + " \"x1\": 447,\n", + " \"y1\": 269,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1432,\n", + " \"y0\": 85,\n", + " \"x1\": 1615,\n", + " \"y1\": 267,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 376,\n", + " 
\"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 283,\n", + " \"y0\": 102,\n", + " \"x1\": 446,\n", + " \"y1\": 265,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1432,\n", + " \"y0\": 84,\n", + " \"x1\": 1615,\n", + " \"y1\": 267,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 377,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 281,\n", + " \"y0\": 99,\n", + " \"x1\": 445,\n", + " \"y1\": 262,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1431,\n", + " \"y0\": 84,\n", + " \"x1\": 1614,\n", + " \"y1\": 267,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 378,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 280,\n", + " \"y0\": 99,\n", + " \"x1\": 443,\n", + " \"y1\": 262,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1431,\n", + " \"y0\": 84,\n", + " \"x1\": 1614,\n", + " \"y1\": 267,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 379,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 279,\n", + " \"y0\": 99,\n", + " \"x1\": 442,\n", + " \"y1\": 262,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1431,\n", + " \"y0\": 85,\n", + " \"x1\": 1614,\n", + " \"y1\": 268,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 380,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 279,\n", + " \"y0\": 99,\n", + " \"x1\": 442,\n", + " \"y1\": 262,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1431,\n", + " \"y0\": 85,\n", + " \"x1\": 1614,\n", + " \"y1\": 268,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 381,\n", + " \"faces\": [\n", + " 
{\n", + " \"face_id\": 0,\n", + " \"x0\": 279,\n", + " \"y0\": 99,\n", + " \"x1\": 442,\n", + " \"y1\": 262,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1431,\n", + " \"y0\": 85,\n", + " \"x1\": 1614,\n", + " \"y1\": 268,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 382,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 279,\n", + " \"y0\": 99,\n", + " \"x1\": 442,\n", + " \"y1\": 262,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1431,\n", + " \"y0\": 85,\n", + " \"x1\": 1614,\n", + " \"y1\": 268,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 383,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 279,\n", + " \"y0\": 99,\n", + " \"x1\": 442,\n", + " \"y1\": 262,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1431,\n", + " \"y0\": 85,\n", + " \"x1\": 1614,\n", + " \"y1\": 268,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 384,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 279,\n", + " \"y0\": 99,\n", + " \"x1\": 442,\n", + " \"y1\": 262,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1431,\n", + " \"y0\": 85,\n", + " \"x1\": 1614,\n", + " \"y1\": 268,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 385,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 279,\n", + " \"y0\": 100,\n", + " \"x1\": 442,\n", + " \"y1\": 262,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1431,\n", + " \"y0\": 85,\n", + " \"x1\": 1615,\n", + " \"y1\": 269,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 386,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 
0,\n", + " \"x0\": 281,\n", + " \"y0\": 101,\n", + " \"x1\": 443,\n", + " \"y1\": 264,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1431,\n", + " \"y0\": 85,\n", + " \"x1\": 1615,\n", + " \"y1\": 269,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 387,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 282,\n", + " \"y0\": 102,\n", + " \"x1\": 445,\n", + " \"y1\": 265,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1431,\n", + " \"y0\": 85,\n", + " \"x1\": 1615,\n", + " \"y1\": 269,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 388,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 104,\n", + " \"x1\": 446,\n", + " \"y1\": 267,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1431,\n", + " \"y0\": 85,\n", + " \"x1\": 1615,\n", + " \"y1\": 269,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 389,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 285,\n", + " \"y0\": 106,\n", + " \"x1\": 447,\n", + " \"y1\": 268,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1431,\n", + " \"y0\": 85,\n", + " \"x1\": 1615,\n", + " \"y1\": 269,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 390,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 286,\n", + " \"y0\": 106,\n", + " \"x1\": 449,\n", + " \"y1\": 268,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1431,\n", + " \"y0\": 85,\n", + " \"x1\": 1615,\n", + " \"y1\": 269,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 391,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 
286,\n", + " \"y0\": 106,\n", + " \"x1\": 449,\n", + " \"y1\": 268,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1431,\n", + " \"y0\": 85,\n", + " \"x1\": 1615,\n", + " \"y1\": 269,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 392,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 286,\n", + " \"y0\": 106,\n", + " \"x1\": 448,\n", + " \"y1\": 268,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1431,\n", + " \"y0\": 85,\n", + " \"x1\": 1615,\n", + " \"y1\": 269,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 393,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 286,\n", + " \"y0\": 106,\n", + " \"x1\": 448,\n", + " \"y1\": 268,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1431,\n", + " \"y0\": 85,\n", + " \"x1\": 1615,\n", + " \"y1\": 269,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 394,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 286,\n", + " \"y0\": 106,\n", + " \"x1\": 448,\n", + " \"y1\": 268,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1432,\n", + " \"y0\": 85,\n", + " \"x1\": 1614,\n", + " \"y1\": 268,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 395,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 286,\n", + " \"y0\": 106,\n", + " \"x1\": 448,\n", + " \"y1\": 268,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1432,\n", + " \"y0\": 85,\n", + " \"x1\": 1614,\n", + " \"y1\": 268,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 396,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 286,\n", + " 
\"y0\": 105,\n", + " \"x1\": 448,\n", + " \"y1\": 267,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1433,\n", + " \"y0\": 87,\n", + " \"x1\": 1613,\n", + " \"y1\": 267,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 397,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 286,\n", + " \"y0\": 105,\n", + " \"x1\": 448,\n", + " \"y1\": 267,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1433,\n", + " \"y0\": 86,\n", + " \"x1\": 1613,\n", + " \"y1\": 266,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 398,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 285,\n", + " \"y0\": 105,\n", + " \"x1\": 446,\n", + " \"y1\": 267,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1434,\n", + " \"y0\": 84,\n", + " \"x1\": 1612,\n", + " \"y1\": 262,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 399,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 105,\n", + " \"x1\": 446,\n", + " \"y1\": 267,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1435,\n", + " \"y0\": 81,\n", + " \"x1\": 1613,\n", + " \"y1\": 259,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 400,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 105,\n", + " \"x1\": 446,\n", + " \"y1\": 267,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1437,\n", + " \"y0\": 78,\n", + " \"x1\": 1615,\n", + " \"y1\": 256,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 401,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 105,\n", + 
" \"x1\": 446,\n", + " \"y1\": 267,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1440,\n", + " \"y0\": 75,\n", + " \"x1\": 1618,\n", + " \"y1\": 254,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 402,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 105,\n", + " \"x1\": 446,\n", + " \"y1\": 267,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1440,\n", + " \"y0\": 75,\n", + " \"x1\": 1620,\n", + " \"y1\": 254,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 403,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 105,\n", + " \"x1\": 446,\n", + " \"y1\": 267,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1440,\n", + " \"y0\": 75,\n", + " \"x1\": 1620,\n", + " \"y1\": 254,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 404,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 105,\n", + " \"x1\": 446,\n", + " \"y1\": 268,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1440,\n", + " \"y0\": 75,\n", + " \"x1\": 1620,\n", + " \"y1\": 254,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 405,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 105,\n", + " \"x1\": 446,\n", + " \"y1\": 268,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1440,\n", + " \"y0\": 75,\n", + " \"x1\": 1620,\n", + " \"y1\": 254,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 406,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 105,\n", + " \"x1\": 447,\n", + 
" \"y1\": 268,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1440,\n", + " \"y0\": 75,\n", + " \"x1\": 1620,\n", + " \"y1\": 254,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 407,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 105,\n", + " \"x1\": 447,\n", + " \"y1\": 268,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1440,\n", + " \"y0\": 75,\n", + " \"x1\": 1620,\n", + " \"y1\": 255,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 408,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 105,\n", + " \"x1\": 447,\n", + " \"y1\": 268,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1440,\n", + " \"y0\": 75,\n", + " \"x1\": 1621,\n", + " \"y1\": 256,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 409,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 105,\n", + " \"x1\": 447,\n", + " \"y1\": 268,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1439,\n", + " \"y0\": 75,\n", + " \"x1\": 1621,\n", + " \"y1\": 257,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 410,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 105,\n", + " \"x1\": 447,\n", + " \"y1\": 268,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1435,\n", + " \"y0\": 75,\n", + " \"x1\": 1622,\n", + " \"y1\": 262,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 411,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 105,\n", + " \"x1\": 447,\n", + " \"y1\": 268,\n", + " 
\"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1433,\n", + " \"y0\": 75,\n", + " \"x1\": 1623,\n", + " \"y1\": 265,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 412,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 105,\n", + " \"x1\": 447,\n", + " \"y1\": 268,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1432,\n", + " \"y0\": 76,\n", + " \"x1\": 1622,\n", + " \"y1\": 266,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 413,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 105,\n", + " \"x1\": 446,\n", + " \"y1\": 267,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1432,\n", + " \"y0\": 76,\n", + " \"x1\": 1622,\n", + " \"y1\": 266,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 414,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 105,\n", + " \"x1\": 446,\n", + " \"y1\": 267,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1432,\n", + " \"y0\": 76,\n", + " \"x1\": 1622,\n", + " \"y1\": 266,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 415,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 105,\n", + " \"x1\": 446,\n", + " \"y1\": 267,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1432,\n", + " \"y0\": 76,\n", + " \"x1\": 1622,\n", + " \"y1\": 266,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 416,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 283,\n", + " \"y0\": 105,\n", + " \"x1\": 445,\n", + " \"y1\": 267,\n", + " \"speaking\": true\n", + 
" },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1431,\n", + " \"y0\": 76,\n", + " \"x1\": 1622,\n", + " \"y1\": 266,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 417,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 105,\n", + " \"x1\": 445,\n", + " \"y1\": 267,\n", + " \"speaking\": true\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1431,\n", + " \"y0\": 76,\n", + " \"x1\": 1621,\n", + " \"y1\": 266,\n", + " \"speaking\": true\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"frame_number\": 418,\n", + " \"faces\": [\n", + " {\n", + " \"face_id\": 0,\n", + " \"x0\": 284,\n", + " \"y0\": 105,\n", + " \"x1\": 445,\n", + " \"y1\": 266,\n", + " \"speaking\": false\n", + " },\n", + " {\n", + " \"face_id\": 1,\n", + " \"x0\": 1431,\n", + " \"y0\": 76,\n", + " \"x1\": 1621,\n", + " \"y1\": 266,\n", + " \"speaking\": false\n", + " }\n", + " ]\n", + " }\n", + "]\n" + ] + } + ], + "source": [ + "json_output = result.json_str\n", + "data = json.loads(json_output)\n", + "print(json.dumps(data, indent=2))" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[~] Extracting audio from /tmp/tmpmao5i_kv.mp4 to /tmp/tmpuopx0_wy/audio.mp3\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "ffmpeg version 4.2.7-0ubuntu0.1 Copyright (c) 2000-2022 the FFmpeg developers\n", + " built with gcc 9 (Ubuntu 9.4.0-1ubuntu1~20.04.1)\n", + " configuration: --prefix=/usr --extra-version=0ubuntu0.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-avresample --disable-filter=resample --enable-avisynth --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 
--enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librsvg --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enable-libwavpack --enable-libwebp --enable-libx265 --enable-libxml2 --enable-libxvid --enable-libzmq --enable-libzvbi --enable-lv2 --enable-omx --enable-openal --enable-opencl --enable-opengl --enable-sdl2 --enable-libdc1394 --enable-libdrm --enable-libiec61883 --enable-nvenc --enable-chromaprint --enable-frei0r --enable-libx264 --enable-shared\n", + " libavutil 56. 31.100 / 56. 31.100\n", + " libavcodec 58. 54.100 / 58. 54.100\n", + " libavformat 58. 29.100 / 58. 29.100\n", + " libavdevice 58. 8.100 / 58. 8.100\n", + " libavfilter 7. 57.100 / 7. 57.100\n", + " libavresample 4. 0. 0 / 4. 0. 0\n", + " libswscale 5. 5.100 / 5. 5.100\n", + " libswresample 3. 5.100 / 3. 5.100\n", + " libpostproc 55. 5.100 / 55. 
5.100\n", + "Input #0, mov,mp4,m4a,3gp,3g2,mj2, from '/tmp/tmpmao5i_kv.mp4':\n", + " Metadata:\n", + " major_brand : mp42\n", + " minor_version : 1\n", + " compatible_brands: isommp41mp42\n", + " creation_time : 2024-03-01T16:25:52.000000Z\n", + " Duration: 00:00:17.47, start: 0.000000, bitrate: 1257 kb/s\n", + " Chapter #0:0: start 0.000000, end 17.469000\n", + " Metadata:\n", + " title : New video game recommendation?\n", + " Chapter #0:1: start 28.000000, end 28.000000\n", + " Metadata:\n", + " title : Elon to Sam - Do you play games?\n", + " Stream #0:0(und): Video: h264 (High) (avc1 / 0x31637661), yuv420p(tv, bt709), 1920x1080 [SAR 1:1 DAR 16:9], 1121 kb/s, 23.93 fps, 23.98 tbr, 24k tbn, 47.95 tbc (default)\n", + " Metadata:\n", + " creation_time : 2024-03-01T16:25:52.000000Z\n", + " handler_name : Core Media Video\n", + " Stream #0:1(eng): Audio: aac (LC) (mp4a / 0x6134706D), 44100 Hz, stereo, fltp, 128 kb/s (default)\n", + " Metadata:\n", + " creation_time : 2024-03-01T16:25:52.000000Z\n", + " handler_name : Core Media Audio\n", + "Stream mapping:\n", + " Stream #0:1 -> #0:0 (aac (native) -> mp3 (libmp3lame))\n", + "Press [q] to stop, [?] 
for help\n", + "Output #0, mp3, to '/tmp/tmpuopx0_wy/audio.mp3':\n", + " Metadata:\n", + " major_brand : mp42\n", + " minor_version : 1\n", + " compatible_brands: isommp41mp42\n", + " TSSE : Lavf58.29.100\n", + " Chapter #0:0: start 0.000000, end 17.469000\n", + " Metadata:\n", + " TIT2 : New video game recommendation?\n", + " Chapter #0:1: start 28.000000, end 28.000000\n", + " Metadata:\n", + " TIT2 : Elon to Sam - Do you play games?\n", + " Stream #0:0(eng): Audio: mp3 (libmp3lame), 44100 Hz, stereo, fltp, 192 kb/s (default)\n", + " Metadata:\n", + " creation_time : 2024-03-01T16:25:52.000000Z\n", + " handler_name : Core Media Audio\n", + " encoder : Lavc58.54.100 libmp3lame\n", + "size= 412kB time=00:00:17.50 bitrate= 192.7kbits/s speed= 116x \n", + "video:0kB audio:411kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.222975%\n", + "ffmpeg version 4.2.7-0ubuntu0.1 Copyright (c) 2000-2022 the FFmpeg developers\n", + " built with gcc 9 (Ubuntu 9.4.0-1ubuntu1~20.04.1)\n", + " configuration: --prefix=/usr --extra-version=0ubuntu0.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-avresample --disable-filter=resample --enable-avisynth --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librsvg --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enable-libwavpack --enable-libwebp --enable-libx265 --enable-libxml2 --enable-libxvid --enable-libzmq 
--enable-libzvbi --enable-lv2 --enable-omx --enable-openal --enable-opencl --enable-opengl --enable-sdl2 --enable-libdc1394 --enable-libdrm --enable-libiec61883 --enable-nvenc --enable-chromaprint --enable-frei0r --enable-libx264 --enable-shared\n", + " libavutil 56. 31.100 / 56. 31.100\n", + " libavcodec 58. 54.100 / 58. 54.100\n", + " libavformat 58. 29.100 / 58. 29.100\n", + " libavdevice 58. 8.100 / 58. 8.100\n", + " libavfilter 7. 57.100 / 7. 57.100\n", + " libavresample 4. 0. 0 / 4. 0. 0\n", + " libswscale 5. 5.100 / 5. 5.100\n", + " libswresample 3. 5.100 / 3. 5.100\n", + " libpostproc 55. 5.100 / 55. 5.100\n", + "Input #0, image2, from '/tmp/tmpuopx0_wy/frame_%05d.png':\n", + " Duration: 00:00:18.17, start: 0.000000, bitrate: N/A\n", + " Stream #0:0: Video: png, rgb24(pc), 1920x1080, 23 fps, 23 tbr, 23 tbn, 23 tbc\n", + "[mp3 @ 0x558822a75800] Skipping 612 bytes of junk at 326.\n", + "[mp3 @ 0x558822a75800] Estimating duration from bitrate, this may be inaccurate\n", + "Input #1, mp3, from '/tmp/tmpuopx0_wy/audio.mp3':\n", + " Metadata:\n", + " major_brand : mp42\n", + " minor_version : 1\n", + " compatible_brands: isommp41mp42\n", + " encoder : Lavf58.29.100\n", + " Duration: 00:00:17.53, start: 0.000000, bitrate: 192 kb/s\n", + " Chapter #1:0: start 0.000000, end 17.469000\n", + " Metadata:\n", + " title : New video game recommendation?\n", + " Chapter #1:1: start 28.000000, end 28.000000\n", + " Metadata:\n", + " title : Elon to Sam - Do you play games?\n", + " Stream #1:0: Audio: mp3, 44100 Hz, stereo, fltp, 192 kb/s\n", + "Stream mapping:\n", + " Stream #0:0 -> #0:0 (png (native) -> h264 (libx264))\n", + " Stream #1:0 -> #0:1 (mp3 (mp3float) -> aac (native))\n", + "Press [q] to stop, [?] 
for help\n", + "[image2 @ 0x558822a6d9c0] Thread message queue blocking; consider raising the thread_queue_size option (current value: 8)\n", + "[libx264 @ 0x558822a7a800] using cpu capabilities: MMX2 SSE2Fast SSSE3 SSE4.2 AVX FMA3 BMI2 AVX2\n", + "[libx264 @ 0x558822a7a800] profile High, level 4.0\n", + "[libx264 @ 0x558822a7a800] 264 - core 155 r2917 0a84d98 - H.264/MPEG-4 AVC codec - Copyleft 2003-2018 - http://www.videolan.org/x264.html - options: cabac=1 ref=3 deblock=1:0:0 analyse=0x3:0x113 me=hex subme=7 psy=1 psy_rd=1.00:0.00 mixed_ref=1 me_range=16 chroma_me=1 trellis=1 8x8dct=1 cqm=0 deadzone=21,11 fast_pskip=1 chroma_qp_offset=-2 threads=24 lookahead_threads=4 sliced_threads=0 nr=0 decimate=1 interlaced=0 bluray_compat=0 constrained_intra=0 bframes=3 b_pyramid=2 b_adapt=1 b_bias=0 direct=1 weightb=1 open_gop=0 weightp=2 keyint=250 keyint_min=23 scenecut=40 intra_refresh=0 rc_lookahead=40 rc=crf mbtree=1 crf=23.0 qcomp=0.60 qpmin=0 qpmax=69 qpstep=4 ip_ratio=1.40 aq=1:1.00\n", + "Output #0, mp4, to 'output_video.mp4':\n", + " Metadata:\n", + " encoder : Lavf58.29.100\n", + " Chapter #0:0: start 0.000000, end 17.469000\n", + " Metadata:\n", + " title : New video game recommendation?\n", + " Chapter #0:1: start 28.000000, end 28.000000\n", + " Metadata:\n", + " title : Elon to Sam - Do you play games?\n", + " Stream #0:0: Video: h264 (libx264) (avc1 / 0x31637661), yuv420p(progressive), 1920x1080, q=-1--1, 23 fps, 11776 tbn, 23 tbc\n", + " Metadata:\n", + " encoder : Lavc58.54.100 libx264\n", + " Side data:\n", + " cpb: bitrate max/min/avg: 0/0/0 buffer size: 0 vbv_delay: -1\n", + " Stream #0:1: Audio: aac (LC) (mp4a / 0x6134706D), 44100 Hz, stereo, fltp, 128 kb/s\n", + " Metadata:\n", + " encoder : Lavc58.54.100 aac\n", + "frame= 418 fps= 92 q=28.0 size= 3072kB time=00:00:15.09 bitrate=1667.5kbits/s speed=3.32x \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Output video = output_video.mp4\n" + ] + }, + { + "name": "stderr", + 
"output_type": "stream", + "text": [ + "frame= 418 fps= 88 q=-1.0 Lsize= 3660kB time=00:00:18.04 bitrate=1661.8kbits/s speed= 3.8x \n", + "video:3370kB audio:276kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.385679%\n", + "[libx264 @ 0x558822a7a800] frame I:2 Avg QP:18.64 size:156782\n", + "[libx264 @ 0x558822a7a800] frame P:154 Avg QP:20.22 size: 16164\n", + "[libx264 @ 0x558822a7a800] frame B:262 Avg QP:26.12 size: 2470\n", + "[libx264 @ 0x558822a7a800] consecutive B-frames: 1.0% 38.3% 24.4% 36.4%\n", + "[libx264 @ 0x558822a7a800] mb I I16..4: 10.5% 63.7% 25.8%\n", + "[libx264 @ 0x558822a7a800] mb P I16..4: 0.6% 1.7% 0.4% P16..4: 21.3% 7.1% 4.3% 0.0% 0.0% skip:64.6%\n", + "[libx264 @ 0x558822a7a800] mb B I16..4: 0.1% 0.1% 0.0% B16..8: 15.8% 1.0% 0.2% direct: 0.3% skip:82.6% L0:35.1% L1:61.2% BI: 3.6%\n", + "[libx264 @ 0x558822a7a800] 8x8 transform intra:63.3% inter:66.9%\n", + "[libx264 @ 0x558822a7a800] coded y,uvDC,uvAC intra: 42.3% 29.3% 9.0% inter: 3.9% 2.5% 0.3%\n", + "[libx264 @ 0x558822a7a800] i16 v,h,dc,p: 29% 29% 11% 31%\n", + "[libx264 @ 0x558822a7a800] i8 v,h,dc,ddl,ddr,vr,hd,vl,hu: 21% 22% 31% 3% 5% 5% 6% 3% 5%\n", + "[libx264 @ 0x558822a7a800] i4 v,h,dc,ddl,ddr,vr,hd,vl,hu: 25% 36% 10% 3% 6% 5% 6% 3% 5%\n", + "[libx264 @ 0x558822a7a800] i8c dc,h,v,p: 62% 20% 15% 3%\n", + "[libx264 @ 0x558822a7a800] Weighted P-Frames: Y:0.0% UV:0.0%\n", + "[libx264 @ 0x558822a7a800] ref P L0: 69.8% 12.7% 12.6% 4.9%\n", + "[libx264 @ 0x558822a7a800] ref B L0: 84.2% 13.8% 2.0%\n", + "[libx264 @ 0x558822a7a800] ref B L1: 98.0% 2.0%\n", + "[libx264 @ 0x558822a7a800] kb/s:1518.69\n", + "[aac @ 0x558822aa4900] Qavg: 1121.730\n" + ] + } + ], + "source": [ + "# Visualize the output and get the path to the output video\n", + "output_video_path = visualize_output(video_path, json_output)\n", + "print(f\"Output video = {output_video_path}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + 
"text/html": [ + "" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Display the output video in Jupyter Notebook\n", + "Video(output_video_path)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "talknet", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.19" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/talknet-asd/talkNet.py b/talknet-asd/talkNet.py new file mode 100644 index 0000000000000000000000000000000000000000..7333e32b55c9f41149ab62b737c56552fc269cab --- /dev/null +++ b/talknet-asd/talkNet.py @@ -0,0 +1,94 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +import sys, time, numpy, os, subprocess, pandas, tqdm + +from loss import lossAV, lossA, lossV +from model.talkNetModel import talkNetModel + +class talkNet(nn.Module): + def __init__(self, lr = 0.0001, lrDecay = 0.95, **kwargs): + super(talkNet, self).__init__() + self.model = talkNetModel().cuda() + self.lossAV = lossAV().cuda() + self.lossA = lossA().cuda() + self.lossV = lossV().cuda() + self.optim = torch.optim.Adam(self.parameters(), lr = lr) + self.scheduler = torch.optim.lr_scheduler.StepLR(self.optim, step_size = 1, gamma=lrDecay) + print(time.strftime("%m-%d %H:%M:%S") + " Model para number = %.2f"%(sum(param.numel() for param in self.model.parameters()) / 1024 / 1024)) + + def train_network(self, loader, epoch, **kwargs): + self.train() + self.scheduler.step(epoch - 1) + index, top1, loss = 0, 0, 0 + lr = self.optim.param_groups[0]['lr'] + for num, (audioFeature, visualFeature, labels) in enumerate(loader, start=1): + self.zero_grad() + audioEmbed = 
self.model.forward_audio_frontend(audioFeature[0].cuda()) # feedForward + visualEmbed = self.model.forward_visual_frontend(visualFeature[0].cuda()) + audioEmbed, visualEmbed = self.model.forward_cross_attention(audioEmbed, visualEmbed) + outsAV= self.model.forward_audio_visual_backend(audioEmbed, visualEmbed) + outsA = self.model.forward_audio_backend(audioEmbed) + outsV = self.model.forward_visual_backend(visualEmbed) + labels = labels[0].reshape((-1)).cuda() # Loss + nlossAV, _, _, prec = self.lossAV.forward(outsAV, labels) + nlossA = self.lossA.forward(outsA, labels) + nlossV = self.lossV.forward(outsV, labels) + nloss = nlossAV + 0.4 * nlossA + 0.4 * nlossV + loss += nloss.detach().cpu().numpy() + top1 += prec + nloss.backward() + self.optim.step() + index += len(labels) + sys.stderr.write(time.strftime("%m-%d %H:%M:%S") + \ + " [%2d] Lr: %5f, Training: %.2f%%, " %(epoch, lr, 100 * (num / loader.__len__())) + \ + " Loss: %.5f, ACC: %2.2f%% \r" %(loss/(num), 100 * (top1/index))) + sys.stderr.flush() + sys.stdout.write("\n") + return loss/num, lr + + def evaluate_network(self, loader, evalCsvSave, evalOrig, **kwargs): + self.eval() + predScores = [] + for audioFeature, visualFeature, labels in tqdm.tqdm(loader): + with torch.no_grad(): + audioEmbed = self.model.forward_audio_frontend(audioFeature[0].cuda()) + visualEmbed = self.model.forward_visual_frontend(visualFeature[0].cuda()) + audioEmbed, visualEmbed = self.model.forward_cross_attention(audioEmbed, visualEmbed) + outsAV= self.model.forward_audio_visual_backend(audioEmbed, visualEmbed) + labels = labels[0].reshape((-1)).cuda() + _, predScore, _, _ = self.lossAV.forward(outsAV, labels) + predScore = predScore[:,1].detach().cpu().numpy() + predScores.extend(predScore) + evalLines = open(evalOrig).read().splitlines()[1:] + labels = [] + labels = pandas.Series( ['SPEAKING_AUDIBLE' for line in evalLines]) + scores = pandas.Series(predScores) + evalRes = pandas.read_csv(evalOrig) + evalRes['score'] = scores + 
evalRes['label'] = labels + evalRes.drop(['label_id'], axis=1,inplace=True) + evalRes.drop(['instance_id'], axis=1,inplace=True) + evalRes.to_csv(evalCsvSave, index=False) + cmd = "python -O utils/get_ava_active_speaker_performance.py -g %s -p %s "%(evalOrig, evalCsvSave) + mAP = float(str(subprocess.run(cmd, shell=True, capture_output =True).stdout).split(' ')[2][:5]) + return mAP + + def saveParameters(self, path): + torch.save(self.state_dict(), path) + + def loadParameters(self, path): + selfState = self.state_dict() + loadedState = torch.load(path) + for name, param in loadedState.items(): + origName = name; + if name not in selfState: + name = name.replace("module.", "") + if name not in selfState: + print("%s is not in the model."%origName) + continue + if selfState[name].size() != loadedState[origName].size(): + sys.stderr.write("Wrong parameter length: %s, model: %s, loaded: %s"%(origName, selfState[name].size(), loadedState[origName].size())) + continue + selfState[name].copy_(param) diff --git a/talknet-asd/talknet_asd_cpu.onnx b/talknet-asd/talknet_asd_cpu.onnx new file mode 100644 index 0000000000000000000000000000000000000000..51164ab7e6ed1e2000fe96ba30074325d7a57b5d --- /dev/null +++ b/talknet-asd/talknet_asd_cpu.onnx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:364fd414d4b952015b2fd784a1558f546407ecb5983916eb60d0709a849e3e0f +size 63213647 diff --git a/talknet-asd/trainTalkNet.py b/talknet-asd/trainTalkNet.py new file mode 100644 index 0000000000000000000000000000000000000000..ffea49e2e5472d20e8229a384776bc06be4cfe84 --- /dev/null +++ b/talknet-asd/trainTalkNet.py @@ -0,0 +1,86 @@ +import time, os, torch, argparse, warnings, glob + +from dataLoader import train_loader, val_loader +from utils.tools import * +from talkNet import talkNet + +def main(): + # The structure of this code is learnt from https://github.com/clovaai/voxceleb_trainer + warnings.filterwarnings("ignore") + + parser = argparse.ArgumentParser(description = 
"TalkNet Training") + # Training details + parser.add_argument('--lr', type=float, default=0.0001,help='Learning rate') + parser.add_argument('--lrDecay', type=float, default=0.95, help='Learning rate decay rate') + parser.add_argument('--maxEpoch', type=int, default=25, help='Maximum number of epochs') + parser.add_argument('--testInterval', type=int, default=1, help='Test and save every [testInterval] epochs') + parser.add_argument('--batchSize', type=int, default=2500, help='Dynamic batch size, default is 2500 frames, other batchsize (such as 1500) will not affect the performance') + parser.add_argument('--nDataLoaderThread', type=int, default=4, help='Number of loader threads') + # Data path + parser.add_argument('--dataPathAVA', type=str, default="/data08/AVA", help='Save path of AVA dataset') + parser.add_argument('--savePath', type=str, default="exps/exp1") + # Data selection + parser.add_argument('--evalDataType', type=str, default="val", help='Only for AVA, to choose the dataset for evaluation, val or test') + # For download dataset only, for evaluation only + parser.add_argument('--downloadAVA', dest='downloadAVA', action='store_true', help='Only download AVA dataset and do related preprocess') + parser.add_argument('--evaluation', dest='evaluation', action='store_true', help='Only do evaluation by using pretrained model [pretrain_AVA.model]') + args = parser.parse_args() + # Data loader + args = init_args(args) + + if args.downloadAVA == True: + preprocess_AVA(args) + quit() + + loader = train_loader(trialFileName = args.trainTrialAVA, \ + audioPath = os.path.join(args.audioPathAVA , 'train'), \ + visualPath = os.path.join(args.visualPathAVA, 'train'), \ + **vars(args)) + trainLoader = torch.utils.data.DataLoader(loader, batch_size = 1, shuffle = True, num_workers = args.nDataLoaderThread) + + loader = val_loader(trialFileName = args.evalTrialAVA, \ + audioPath = os.path.join(args.audioPathAVA , args.evalDataType), \ + visualPath = 
os.path.join(args.visualPathAVA, args.evalDataType), \ + **vars(args)) + valLoader = torch.utils.data.DataLoader(loader, batch_size = 1, shuffle = False, num_workers = 16) + + if args.evaluation == True: + download_pretrain_model_AVA() + s = talkNet(**vars(args)) + s.loadParameters('pretrain_AVA.model') + print("Model %s loaded from previous state!"%('pretrain_AVA.model')) + mAP = s.evaluate_network(loader = valLoader, **vars(args)) + print("mAP %2.2f%%"%(mAP)) + quit() + + modelfiles = glob.glob('%s/model_0*.model'%args.modelSavePath) + modelfiles.sort() + if len(modelfiles) >= 1: + print("Model %s loaded from previous state!"%modelfiles[-1]) + epoch = int(os.path.splitext(os.path.basename(modelfiles[-1]))[0][6:]) + 1 + s = talkNet(epoch = epoch, **vars(args)) + s.loadParameters(modelfiles[-1]) + else: + epoch = 1 + s = talkNet(epoch = epoch, **vars(args)) + + mAPs = [] + scoreFile = open(args.scoreSavePath, "a+") + + while(1): + loss, lr = s.train_network(epoch = epoch, loader = trainLoader, **vars(args)) + + if epoch % args.testInterval == 0: + s.saveParameters(args.modelSavePath + "/model_%04d.model"%epoch) + mAPs.append(s.evaluate_network(epoch = epoch, loader = valLoader, **vars(args))) + print(time.strftime("%Y-%m-%d %H:%M:%S"), "%d epoch, mAP %2.2f%%, bestmAP %2.2f%%"%(epoch, mAPs[-1], max(mAPs))) + scoreFile.write("%d epoch, LR %f, LOSS %f, mAP %2.2f%%, bestmAP %2.2f%%\n"%(epoch, lr, loss, mAPs[-1], max(mAPs))) + scoreFile.flush() + + if epoch >= args.maxEpoch: + quit() + + epoch += 1 + +if __name__ == '__main__': + main() diff --git a/talknet-asd/utils/get_ava_active_speaker_performance.py b/talknet-asd/utils/get_ava_active_speaker_performance.py new file mode 100644 index 0000000000000000000000000000000000000000..2e66d1da9b2a06234b2f7afc6f1cecc81b0cf931 --- /dev/null +++ b/talknet-asd/utils/get_ava_active_speaker_performance.py @@ -0,0 +1,236 @@ +r"""Compute active speaker detection performance for the AVA dataset. 
+Please send any questions about this code to the Google Group ava-dataset-users: +https://groups.google.com/forum/#!forum/ava-dataset-users +Example usage: +python -O get_ava_active_speaker_performance.py \ +-g testdata/eval.csv \ +-p testdata/predictions.csv \ +-v +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import logging +import time, warnings +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt +warnings.filterwarnings("ignore") + +def compute_average_precision(precision, recall): + """Compute Average Precision according to the definition in VOCdevkit. + Precision is modified to ensure that it does not decrease as recall + decrease. + Args: + precision: A float [N, 1] numpy array of precisions + recall: A float [N, 1] numpy array of recalls + Raises: + ValueError: if the input is not of the correct format + Returns: + average_precison: The area under the precision recall curve. NaN if + precision and recall are None. 
+ """ + if precision is None: + if recall is not None: + raise ValueError("If precision is None, recall must also be None") + return np.NAN + + if not isinstance(precision, np.ndarray) or not isinstance( + recall, np.ndarray): + raise ValueError("precision and recall must be numpy array") + if precision.dtype != np.float or recall.dtype != np.float: + raise ValueError("input must be float numpy array.") + if len(precision) != len(recall): + raise ValueError("precision and recall must be of the same size.") + if not precision.size: + return 0.0 + if np.amin(precision) < 0 or np.amax(precision) > 1: + raise ValueError("Precision must be in the range of [0, 1].") + if np.amin(recall) < 0 or np.amax(recall) > 1: + raise ValueError("recall must be in the range of [0, 1].") + if not all(recall[i] <= recall[i + 1] for i in range(len(recall) - 1)): + raise ValueError("recall must be a non-decreasing array") + + recall = np.concatenate([[0], recall, [1]]) + precision = np.concatenate([[0], precision, [0]]) + + # Smooth precision to be monotonically decreasing. + for i in range(len(precision) - 2, -1, -1): + precision[i] = np.maximum(precision[i], precision[i + 1]) + + indices = np.where(recall[1:] != recall[:-1])[0] + 1 + average_precision = np.sum( + (recall[indices] - recall[indices - 1]) * precision[indices]) + return average_precision + + +def load_csv(filename, column_names): + """Loads CSV from the filename using given column names. + Adds uid column. + Args: + filename: Path to the CSV file to load. + column_names: A list of column names for the data. + Returns: + df: A Pandas DataFrame containing the data. + """ + # Here and elsewhere, df indicates a DataFrame variable. + + df = pd.read_csv(filename, usecols=column_names) + #df = pd.read_csv(filename, header=None, names=column_names) + + # Creates a unique id from frame timestamp and entity id. 
+ df["uid"] = (df["frame_timestamp"].map(str) + ":" + df["entity_id"]) + return df + + +def eq(a, b, tolerance=1e-09): + """Returns true if values are approximately equal.""" + return abs(a - b) <= tolerance + + +def merge_groundtruth_and_predictions(df_groundtruth, df_predictions): + """Merges groundtruth and prediction DataFrames. + The returned DataFrame is merged on uid field and sorted in descending order + by score field. Bounding boxes are checked to make sure they match between + groundtruth and predictions. + Args: + df_groundtruth: A DataFrame with groundtruth data. + df_predictions: A DataFrame with predictions data. + Returns: + df_merged: A merged DataFrame, with rows matched on uid column. + """ + if df_groundtruth["uid"].count() != df_predictions["uid"].count(): + raise ValueError( + "Groundtruth and predictions CSV must have the same number of " + "unique rows.") + # print(df_predictions["label"].unique()) + if df_predictions["label"].unique() != ["SPEAKING_AUDIBLE"]: + raise ValueError( + "Predictions CSV must contain only SPEAKING_AUDIBLE label.") + + if df_predictions["score"].count() < df_predictions["uid"].count(): + raise ValueError("Predictions CSV must contain score value for every row.") + + # Merges groundtruth and predictions on uid, validates that uid is unique + # in both frames, and sorts the resulting frame by the predictions score. + df_merged = df_groundtruth.merge( + df_predictions, + on="uid", + suffixes=("_groundtruth", "_prediction"), + validate="1:1").sort_values( + by=["score"], ascending=False).reset_index() + # Validates that bounding boxes in ground truth and predictions match for the + # same uids. 
+ df_merged["bounding_box_correct"] = np.where( + eq(df_merged["entity_box_x1_groundtruth"], + df_merged["entity_box_x1_prediction"]) + & eq(df_merged["entity_box_x2_groundtruth"], + df_merged["entity_box_x2_prediction"]) + & eq(df_merged["entity_box_y1_groundtruth"], + df_merged["entity_box_y1_prediction"]) + & eq(df_merged["entity_box_y2_groundtruth"], + df_merged["entity_box_y2_prediction"]), True, False) + + if (~df_merged["bounding_box_correct"]).sum() > 0: + raise ValueError( + "Mismatch between groundtruth and predictions bounding boxes found at " + + str(list(df_merged[~df_merged["bounding_box_correct"]]["uid"]))) + + return df_merged + + +def get_all_positives(df_merged): + """Counts all positive examples in the groundtruth dataset.""" + return df_merged[df_merged["label_groundtruth"] == + "SPEAKING_AUDIBLE"]["uid"].count() + + +def calculate_precision_recall(df_merged): + """Calculates precision and recall arrays going through df_merged row-wise.""" + all_positives = get_all_positives(df_merged) + # Populates each row with 1 if this row is a true positive + # (at its score level). + df_merged["is_tp"] = np.where( + (df_merged["label_groundtruth"] == "SPEAKING_AUDIBLE") & + (df_merged["label_prediction"] == "SPEAKING_AUDIBLE"), 1, 0) + + # Counts true positives up to and including that row. + df_merged["tp"] = df_merged["is_tp"].cumsum() + + # Calculates precision for every row counting true positives up to + # and including that row over the index (1-based) of that row. + df_merged["precision"] = df_merged["tp"] / (df_merged.index + 1) + # Calculates recall for every row counting true positives up to + # and including that row over all positives in the groundtruth dataset. 
+ + df_merged["recall"] = df_merged["tp"] / all_positives + logging.info( + "\n%s\n", + df_merged.head(10)[[ + "uid", "score", "label_groundtruth", "is_tp", "tp", "precision", + "recall" + ]]) + + return np.array(df_merged["precision"]), np.array(df_merged["recall"]) + + +def run_evaluation(groundtruth, predictions): + """Runs AVA Active Speaker evaluation, printing average precision result.""" + df_groundtruth = load_csv( + groundtruth, + column_names=[ + "video_id", "frame_timestamp", "entity_box_x1", "entity_box_y1", + "entity_box_x2", "entity_box_y2", "label", "entity_id" + ]) + df_predictions = load_csv( + predictions, + column_names=[ + "video_id", "frame_timestamp", "entity_box_x1", "entity_box_y1", + "entity_box_x2", "entity_box_y2", "label", "entity_id", "score" + ]) + df_merged = merge_groundtruth_and_predictions(df_groundtruth, df_predictions) + precision, recall = calculate_precision_recall(df_merged) + mAP = 100 * compute_average_precision(precision, recall) + print("average precision: %2.2f%%"%(mAP)) + return mAP + + +def parse_arguments(): + """Parses command-line flags. + Returns: + args: a named tuple containing three file objects args.labelmap, + args.groundtruth, and args.detections. 
+ """ + parser = argparse.ArgumentParser() + parser.add_argument( + "-g", + "--groundtruth", + help="CSV file containing ground truth.", + type=argparse.FileType("r"), + required=True) + parser.add_argument( + "-p", + "--predictions", + help="CSV file containing active speaker predictions.", + type=argparse.FileType("r"), + required=True) + parser.add_argument( + "-v", "--verbose", help="Increase output verbosity.", action="store_true") + return parser.parse_args() + + +def main(): + start = time.time() + args = parse_arguments() + if args.verbose: + logging.basicConfig(level=logging.DEBUG) + del args.verbose + mAP = run_evaluation(**vars(args)) + logging.info("Computed in %s seconds", time.time() - start) + return mAP + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/talknet-asd/utils/overall.png b/talknet-asd/utils/overall.png new file mode 100644 index 0000000000000000000000000000000000000000..e86b409c1ab66f4f415c1488f804ae164ca6e590 --- /dev/null +++ b/talknet-asd/utils/overall.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1ad1b3fdae7bcd23d0a4ac93a26e5e72f7adf7d835d55f0628079e6776f5267 +size 166850 diff --git a/talknet-asd/utils/tools.py b/talknet-asd/utils/tools.py new file mode 100644 index 0000000000000000000000000000000000000000..8e7117854ca2a61e224c0057a5986a5d95af3b42 --- /dev/null +++ b/talknet-asd/utils/tools.py @@ -0,0 +1,186 @@ +import os, subprocess, glob, pandas, tqdm, cv2, numpy +from scipy.io import wavfile + +def init_args(args): + # The details for the following folders/files can be found in the annotation of the function 'preprocess_AVA' below + args.modelSavePath = os.path.join(args.savePath, 'model') + args.scoreSavePath = os.path.join(args.savePath, 'score.txt') + args.trialPathAVA = os.path.join(args.dataPathAVA, 'csv') + args.audioOrigPathAVA = os.path.join(args.dataPathAVA, 'orig_audios') + args.visualOrigPathAVA= os.path.join(args.dataPathAVA, 'orig_videos') + args.audioPathAVA 
= os.path.join(args.dataPathAVA, 'clips_audios') + args.visualPathAVA = os.path.join(args.dataPathAVA, 'clips_videos') + args.trainTrialAVA = os.path.join(args.trialPathAVA, 'train_loader.csv') + + if args.evalDataType == 'val': + args.evalTrialAVA = os.path.join(args.trialPathAVA, 'val_loader.csv') + args.evalOrig = os.path.join(args.trialPathAVA, 'val_orig.csv') + args.evalCsvSave = os.path.join(args.savePath, 'val_res.csv') + else: + args.evalTrialAVA = os.path.join(args.trialPathAVA, 'test_loader.csv') + args.evalOrig = os.path.join(args.trialPathAVA, 'test_orig.csv') + args.evalCsvSave = os.path.join(args.savePath, 'test_res.csv') + + os.makedirs(args.modelSavePath, exist_ok = True) + os.makedirs(args.dataPathAVA, exist_ok = True) + return args + + +def download_pretrain_model_AVA(): + if os.path.isfile('pretrain_AVA.model') == False: + Link = "1NVIkksrD3zbxbDuDbPc_846bLfPSZcZm" + cmd = "gdown --id %s -O %s"%(Link, 'pretrain_AVA.model') + subprocess.call(cmd, shell=True, stdout=None) + +def preprocess_AVA(args): + # This preprocessing is modified based on this [repository](https://github.com/fuankarion/active-speakers-context). + # The required space is 302 G. + # If you do not have enough space, you can delete `orig_videos`(167G) when you get `clips_videos(85G)`. + # Also you can delete `orig_audios`(44G) when you get `clips_audios`(6.4G). + # So the final space is less than 100G. 
+ # The AVA dataset will be saved in 'AVApath' folder like the following format: + # ``` + # ├── clips_audios (The audio clips cut from the original movies) + # │   ├── test + # │   ├── train + # │   └── val + # ├── clips_videos (The face clips cut from the original movies, be save in the image format, frame-by-frame) + # │   ├── test + # │   ├── train + # │   └── val + # ├── csv + # │   ├── test_file_list.txt (name of the test videos) + # │   ├── test_loader.csv (The csv file we generated to load data for testing) + # │   ├── test_orig.csv (The combination of the given test csv files) + # │   ├── train_loader.csv (The csv file we generated to load data for training) + # │   ├── train_orig.csv (The combination of the given training csv files) + # │   ├── trainval_file_list.txt (name of the train/val videos) + # │   ├── val_loader.csv (The csv file we generated to load data for validation) + # │   └── val_orig.csv (The combination of the given validation csv files) + # ├── orig_audios (The original audios from the movies) + # │   ├── test + # │   └── trainval + # └── orig_videos (The original movies) + # ├── test + # └── trainval + # ``` + + download_csv(args) # Take 1 minute + download_videos(args) # Take 6 hours + extract_audio(args) # Take 1 hour + extract_audio_clips(args) # Take 3 minutes + extract_video_clips(args) # Take about 2 days + +def download_csv(args): + # Take 1 minute to download the required csv files + Link = "1C1cGxPHaJAl1NQ2i7IhRgWmdvsPhBCUy" + cmd = "gdown --id %s -O %s"%(Link, args.dataPathAVA + '/csv.tar.gz') + subprocess.call(cmd, shell=True, stdout=None) + cmd = "tar -xzvf %s -C %s"%(args.dataPathAVA + '/csv.tar.gz', args.dataPathAVA) + subprocess.call(cmd, shell=True, stdout=None) + os.remove(args.dataPathAVA + '/csv.tar.gz') + +def download_videos(args): + # Take 6 hours to download the original movies, follow this repository: https://github.com/cvdfoundation/ava-dataset + for dataType in ['trainval', 'test']: + fileList = 
open('%s/%s_file_list.txt'%(args.trialPathAVA, dataType)).read().splitlines() + outFolder = '%s/%s'%(args.visualOrigPathAVA, dataType) + for fileName in fileList: + cmd = "wget -P %s https://s3.amazonaws.com/ava-dataset/%s/%s"%(outFolder, dataType, fileName) + subprocess.call(cmd, shell=True, stdout=None) + +def extract_audio(args): + # Take 1 hour to extract the audio from movies + for dataType in ['trainval', 'test']: + inpFolder = '%s/%s'%(args.visualOrigPathAVA, dataType) + outFolder = '%s/%s'%(args.audioOrigPathAVA, dataType) + os.makedirs(outFolder, exist_ok = True) + videos = glob.glob("%s/*"%(inpFolder)) + for videoPath in tqdm.tqdm(videos): + audioPath = '%s/%s'%(outFolder, videoPath.split('/')[-1].split('.')[0] + '.wav') + cmd = ("ffmpeg -y -i %s -async 1 -ac 1 -vn -acodec pcm_s16le -ar 16000 -threads 8 %s -loglevel panic" % (videoPath, audioPath)) + subprocess.call(cmd, shell=True, stdout=None) + + +def extract_audio_clips(args): + # Take 3 minutes to extract the audio clips + dic = {'train':'trainval', 'val':'trainval', 'test':'test'} + for dataType in ['train', 'val', 'test']: + df = pandas.read_csv(os.path.join(args.trialPathAVA, '%s_orig.csv'%(dataType)), engine='python') + dfNeg = pandas.concat([df[df['label_id'] == 0], df[df['label_id'] == 2]]) + dfPos = df[df['label_id'] == 1] + insNeg = dfNeg['instance_id'].unique().tolist() + insPos = dfPos['instance_id'].unique().tolist() + df = pandas.concat([dfPos, dfNeg]).reset_index(drop=True) + df = df.sort_values(['entity_id', 'frame_timestamp']).reset_index(drop=True) + entityList = df['entity_id'].unique().tolist() + df = df.groupby('entity_id') + audioFeatures = {} + outDir = os.path.join(args.audioPathAVA, dataType) + audioDir = os.path.join(args.audioOrigPathAVA, dic[dataType]) + for l in df['video_id'].unique().tolist(): + d = os.path.join(outDir, l[0]) + if not os.path.isdir(d): + os.makedirs(d) + for entity in tqdm.tqdm(entityList, total = len(entityList)): + insData = df.get_group(entity) + 
videoKey = insData.iloc[0]['video_id'] + start = insData.iloc[0]['frame_timestamp'] + end = insData.iloc[-1]['frame_timestamp'] + entityID = insData.iloc[0]['entity_id'] + insPath = os.path.join(outDir, videoKey, entityID+'.wav') + if videoKey not in audioFeatures.keys(): + audioFile = os.path.join(audioDir, videoKey+'.wav') + sr, audio = wavfile.read(audioFile) + audioFeatures[videoKey] = audio + audioStart = int(float(start)*sr) + audioEnd = int(float(end)*sr) + audioData = audioFeatures[videoKey][audioStart:audioEnd] + wavfile.write(insPath, sr, audioData) + +def extract_video_clips(args): + # Take about 2 days to crop the face clips. + # You can optimize this code to save time, while this process is one-time. + # If you do not need the data for the test set, you can only deal with the train and val part. That will take 1 day. + # This procession may have many warning info, you can just ignore it. + dic = {'train':'trainval', 'val':'trainval', 'test':'test'} + for dataType in ['train', 'val', 'test']: + df = pandas.read_csv(os.path.join(args.trialPathAVA, '%s_orig.csv'%(dataType))) + dfNeg = pandas.concat([df[df['label_id'] == 0], df[df['label_id'] == 2]]) + dfPos = df[df['label_id'] == 1] + insNeg = dfNeg['instance_id'].unique().tolist() + insPos = dfPos['instance_id'].unique().tolist() + df = pandas.concat([dfPos, dfNeg]).reset_index(drop=True) + df = df.sort_values(['entity_id', 'frame_timestamp']).reset_index(drop=True) + entityList = df['entity_id'].unique().tolist() + df = df.groupby('entity_id') + outDir = os.path.join(args.visualPathAVA, dataType) + audioDir = os.path.join(args.visualOrigPathAVA, dic[dataType]) + for l in df['video_id'].unique().tolist(): + d = os.path.join(outDir, l[0]) + if not os.path.isdir(d): + os.makedirs(d) + for entity in tqdm.tqdm(entityList, total = len(entityList)): + insData = df.get_group(entity) + videoKey = insData.iloc[0]['video_id'] + entityID = insData.iloc[0]['entity_id'] + videoDir = 
os.path.join(args.visualOrigPathAVA, dic[dataType]) + videoFile = glob.glob(os.path.join(videoDir, '{}.*'.format(videoKey)))[0] + V = cv2.VideoCapture(videoFile) + insDir = os.path.join(os.path.join(outDir, videoKey, entityID)) + if not os.path.isdir(insDir): + os.makedirs(insDir) + j = 0 + for _, row in insData.iterrows(): + imageFilename = os.path.join(insDir, str("%.2f"%row['frame_timestamp'])+'.jpg') + V.set(cv2.CAP_PROP_POS_MSEC, row['frame_timestamp'] * 1e3) + _, frame = V.read() + h = numpy.size(frame, 0) + w = numpy.size(frame, 1) + x1 = int(row['entity_box_x1'] * w) + y1 = int(row['entity_box_y1'] * h) + x2 = int(row['entity_box_x2'] * w) + y2 = int(row['entity_box_y2'] * h) + face = frame[y1:y2, x1:x2, :] + j = j+1 + cv2.imwrite(imageFilename, face) diff --git a/yolo-face-person-detector/.gitattributes b/yolo-face-person-detector/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..4e70313b040459176c367473f81d0d74500eed2b --- /dev/null +++ b/yolo-face-person-detector/.gitattributes @@ -0,0 +1,38 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs 
diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +images/image.png filter=lfs diff=lfs merge=lfs -text +images/out_mCFMn0UkRt0.avi filter=lfs diff=lfs merge=lfs -text +images/output.mp4 filter=lfs diff=lfs merge=lfs -text diff --git a/yolo-face-person-detector/.gitignore b/yolo-face-person-detector/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..6be48c0246aebed8cf985a18f77ca9b2803b2616 --- /dev/null +++ b/yolo-face-person-detector/.gitignore @@ -0,0 +1,2 @@ +.DS_Store +.idea/ diff --git a/yolo-face-person-detector/README.md b/yolo-face-person-detector/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1a1222d680f70d02cd3f65c055771f9695fec592 --- /dev/null +++ b/yolo-face-person-detector/README.md @@ -0,0 +1,172 @@ +--- +license: agpl-3.0 +pipeline_tag: object-detection +library_name: ultralytics +tags: +- yolo +- yolov8 +- ultralytics +- object-detection +- computer-vision +- face-detection +- person-detection +--- + +# YOLOv8x Face & Person Detector + +
+ + Open in Spaces + + YOLO Detection Example + +
+ +## Model Description + +This model is a fine-tuned version of **YOLOv8x** specialized in detecting two specific classes: **Face** and **Person**. + +It has been trained on a large-scale proprietary dataset consisting of approximately 150,000 images. +The high capacity of the YOLOv8x architecture combined with a diverse proprietary dataset ensures high accuracy and robustness in various scenarios. + + +## How to Use + +### Installation +```bash +pip install ultralytics==8.1.0 torch==2.5.1 transformers huggingface_hub +``` + +### 1. Use with transformers + +You can load the model using the Hugging Face transformers library by enabling custom code execution. + +```python +from transformers import AutoModel +from PIL import Image +import torch + +# 1. Load model with trust_remote_code=True +torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32 +device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") +model = AutoModel.from_pretrained( + "iitolstykh/YOLO-Face-Person-Detector", + trust_remote_code=True, + dtype=torch_dtype, +).to(device) + +# 2. Load image (You can use URL, PIL.Image or np.ndarray) +image = Image.open("path/to/your/image.jpg") +# image = cv2.imread("path/to/your/image.jpg") + +# 3. Perform inference +results = model(image, conf=0.4, iou=0.7)[0] + +# 4. Process results +print("Found objects:", [results.names[int(det.cls)] for det in results.boxes]) +print("Boxes:", results.boxes) +# render_result(model=model.yolo, image=image, result=results).show() +``` + +### 2. Use with ultralytics + +If you prefer the standard Ultralytics API, you can download the weights from the Hub and load them directly. + +```python +from ultralytics import YOLO +from huggingface_hub import hf_hub_download +import torch + +# 1. Download model weights +model_path = hf_hub_download( + repo_id="iitolstykh/YOLO-Face-Person-Detector", + filename="yolov8x_person_face.pt", + repo_type="model" +) + +# 2. 
Load model +torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32 +device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") +model = YOLO(model_path) +model.fuse() +if torch_dtype is torch.float16: + model.model = model.model.half() +model.to(device) + +# 3. Perform inference +image = 'https://variety.com/wp-content/uploads/2023/04/MCDNOHA_SP001.jpg' +results = model.predict(image, conf=0.4, iou=0.7, half=torch_dtype is torch.float16) + +# 4. Show results +for result in results: + boxes = result.boxes + print("Found objects:", [result.names[int(c)] for c in boxes.cls]) +``` + +### 3. Use with ultralyticsplus + +This method automatically handles model downloading for ultralytics YOLO model. + +```bash +pip install ultralyticsplus==0.1.0 +``` + +```python +from ultralyticsplus import YOLO, render_result + +# 1. Load model +model = YOLO('iitolstykh/YOLO-Face-Person-Detector') + +# 2. Set model parameters +model.overrides['conf'] = 0.4 +model.overrides['iou'] = 0.7 +model.overrides['max_det'] = 100 + +# 3. Set image (You can use URL, PIL.Image or np.ndarray) +image = 'https://variety.com/wp-content/uploads/2023/04/MCDNOHA_SP001.jpg' + +# 4. Perform inference +results = model.predict(image) + +# 5. Show results +print("Found objects:", [results[0].names[int(det.cls)] for det in results[0].boxes]) +render = render_result(model=model, image=image, result=results[0]) +render.show() +``` + +## License + +This model is based on the Ultralytics YOLOv8 architecture and inherits the **AGPL-3.0 License**. + +Please refer to the official [Ultralytics Licensing](https://huggingface.co/Ultralytics/YOLOv8#license) details for more information regarding commercial usage and restrictions. 
+ +## Citation + +🌟 If you find our work helpful, please consider citing our papers and leaving valuable stars + +```bibtex +@article{mivolo2023, + Author = {Maksim Kuprashevich and Irina Tolstykh}, + Title = {MiVOLO: Multi-input Transformer for Age and Gender Estimation}, + Year = {2023}, + Eprint = {arXiv:2307.04616}, +} +``` +```bibtex +@article{mivolo2024, + Author = {Maksim Kuprashevich and Grigorii Alekseenko and Irina Tolstykh}, + Title = {Beyond Specialization: Assessing the Capabilities of MLLMs in Age and Gender Estimation}, + Year = {2024}, + Eprint = {arXiv:2403.02302}, +} +``` +```bibtex +@article{cerberusdet, + Author = {Irina Tolstykh,Michael Chernyshov,Maksim Kuprashevich}, + Title = {CerberusDet: Unified Multi-Dataset Object Detection}, + Year = {2024}, + Eprint = {arXiv:2407.12632}, +} +``` diff --git a/yolo-face-person-detector/config.json b/yolo-face-person-detector/config.json new file mode 100644 index 0000000000000000000000000000000000000000..fb8af9ce4fd81c756460cffa90107a21d68ce6a1 --- /dev/null +++ b/yolo-face-person-detector/config.json @@ -0,0 +1,24 @@ +{ + "architectures": [ + "YOLOV8ForObjectDetection" + ], + "auto_map": { + "AutoConfig": "configuration_yolo.YoloV8Config", + "AutoModel": "modeling_yolo.YOLOV8ForObjectDetection" + }, + + "names": { + "0": "person", + "1": "face" + }, + + "model_config": "yolov8x.yaml", + "num_classes": 2, + "task": "detect", + "input_size": 640, + + "model_type": "yolov8", + "torch_dtype": "float16", + "transformers_version": "4.57.1", + "verbose": 0 + } \ No newline at end of file diff --git a/yolo-face-person-detector/configuration_yolo.py b/yolo-face-person-detector/configuration_yolo.py new file mode 100644 index 0000000000000000000000000000000000000000..91b2e1e02595f53f9ac3968a3a43ed24d4b552cb --- /dev/null +++ b/yolo-face-person-detector/configuration_yolo.py @@ -0,0 +1,30 @@ +"""A HuggingFace-style model configuration.""" +from typing import Any, Dict, List +from transformers import 
PretrainedConfig + + +class YoloV8Config(PretrainedConfig): + model_type = 'yolov8' + + def __init__( + self, + model_config: str = "yolov8x.yaml", + task: str = 'detect', + num_classes: int = 2, + num_channels: int = 3, + input_size: int = 640, + names: Dict = {"0": "person", "1": "face"}, + stride: List[int] = [8, 16, 32], + verbose: bool = False, + **kwargs: Any + ): + self.input_size = input_size + self.num_channels = num_channels + self.task = task + self.model_config = model_config + self.num_classes = num_classes + self.stride = stride + self.verbose = bool(verbose) + self.names = {int(key): value for key, value in names.items()} + + super().__init__(**kwargs) diff --git a/yolo-face-person-detector/images/image.png b/yolo-face-person-detector/images/image.png new file mode 100644 index 0000000000000000000000000000000000000000..dbcce5df4cecae81f87fde19030234dbee85d9fe --- /dev/null +++ b/yolo-face-person-detector/images/image.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3074ec787948ae62c4388d4a48777ea0be10c2c538a67a9c5f18062b5b079122 +size 900871 diff --git a/yolo-face-person-detector/images/output.mp4 b/yolo-face-person-detector/images/output.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..19ab604650493f53b0f43c39e564bdba21fb0e65 --- /dev/null +++ b/yolo-face-person-detector/images/output.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3bc956f838fb79c4936b19229f1ea02f6b4626249b509b28bc2a6eab0186e02 +size 17398399 diff --git a/yolo-face-person-detector/model.safetensors b/yolo-face-person-detector/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e1cad4001919ee90b6a48a67b96a12e0a186e30b --- /dev/null +++ b/yolo-face-person-detector/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:278524fbd3c5b736493294983f91ef49a3fe91b5c97188ec8cc8be58288577fd +size 136486204 diff --git 
a/yolo-face-person-detector/modeling_yolo.py b/yolo-face-person-detector/modeling_yolo.py new file mode 100644 index 0000000000000000000000000000000000000000..f8b4604f27321ac8960cc887f72e6f7cb871bd57 --- /dev/null +++ b/yolo-face-person-detector/modeling_yolo.py @@ -0,0 +1,249 @@ +import os +from typing import Union, Optional, Tuple, List + +import numpy as np +from PIL import Image +import torch.nn as nn +import torch + +from ultralytics import YOLO +from ultralytics.nn.tasks import ( + DetectionModel, + BaseModel, + yaml_model_load, + LOGGER, + parse_model, + deepcopy, + Detect, + Segment, + Pose, + OBB, + initialize_weights, +) +from ultralytics.engine.results import Results +from .configuration_yolo import YoloV8Config +from transformers import PreTrainedModel + +os.unsetenv("CUBLAS_WORKSPACE_CONFIG") + + +class YOLOV8DetectionModel(BaseModel): + _predict_augment = DetectionModel._predict_augment + _descale_pred = DetectionModel._descale_pred + _clip_augmented = DetectionModel._clip_augmented + init_criterion = DetectionModel.init_criterion + + # model, input channels, number of classes + def __init__(self, cfg="yolov8n.yaml", ch=3, nc=None, verbose=True, stride: List[int]=[8, 16, 32]): + """ + Initializes the YOLOv8 detection model with the given configuration and parameters. + + This constructor parses the model configuration (YAML), sets up the input channels and number of classes, + builds the model architecture, and initializes the strides and weights. + + Args: + cfg (str | dict): Path to the YAML configuration file or the configuration dictionary itself. Defaults to "yolov8n.yaml". + ch (int): Number of input channels. Defaults to 3. + nc (int, optional): Number of classes. If provided, overrides the value in the YAML config. Defaults to None. + verbose (bool): Whether to print model details during initialization. Defaults to True. + stride (List[int]): A list of stride values for the detection layer. Defaults to [8, 16, 32]. 
+ """ + super().__init__() + + self.yaml = cfg if isinstance(cfg, dict) else yaml_model_load(cfg) # cfg dict + + # Define model + ch = self.yaml["ch"] = self.yaml.get("ch", ch) # input channels + if nc and nc != self.yaml["nc"]: + LOGGER.info(f"Overriding model.yaml nc={self.yaml['nc']} with nc={nc}") + self.yaml["nc"] = nc # override YAML value + self.model, self.save = parse_model(deepcopy(self.yaml), ch=ch, verbose=verbose) # model, savelist + self.names = {i: f"{i}" for i in range(self.yaml["nc"])} # default names dict + self.inplace = self.yaml.get("inplace", True) + + # Build strides + m = self.model[-1] # Detect() + if isinstance(m, (Detect, Segment, Pose, OBB)): + m.inplace = self.inplace + m.stride = torch.tensor(stride, dtype=torch.float32) # forward + self.stride = m.stride + m.bias_init() # only run once + else: + self.stride = torch.Tensor([32]) # default stride for i.e. RTDETR + + # Init weights, biases + initialize_weights(self) + if verbose: + self.info() + LOGGER.info("") + + +class YOLOWrapper(YOLO): + + def __init__(self, model: torch.nn.Module, task=None) -> None: + """ + Initializes the YOLO wrapper around a specific PyTorch model. + + This allows a standard PyTorch module to be used within the Ultralytics YOLO ecosystem + by overriding the default initialization to accept an existing model object. + + Args: + model (torch.nn.Module): The PyTorch model instance to wrap. + task (str, optional): The specific task type for the YOLO model (e.g., 'detect'). Defaults to None. 
+ """ + super().__init__(model="", task=task) + self.model = model + + +class YOLOV8PreTrainedModel(PreTrainedModel): + config_class = YoloV8Config + base_model_prefix = 'model' + _no_split_modules = ['model'] + + def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None: + """Initializes the weights of the model layers.""" + + if module is nn.Conv2d: + pass # nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + elif module is nn.BatchNorm2d: + module.eps = 1e-3 + module.momentum = 0.03 + elif module in [nn.Hardswish, nn.LeakyReLU, nn.ReLU, nn.ReLU6, nn.SiLU]: + module.inplace = True + + +class YOLOV8ForObjectDetection(YOLOV8PreTrainedModel): + + def __init__(self, config: YoloV8Config): + """ + Initializes the YOLOv8 object detection model based on the provided configuration. + + Args: + config (YoloV8Config): The configuration object containing model parameters, channels, classes, and strides. + """ + super().__init__(config) + self.config = config + + # initialize a model + self.model = YOLOV8DetectionModel( + cfg=self.config.model_config, + ch=self.config.num_channels, + nc=self.config.num_classes, + verbose=self.config.verbose, + stride=self.config.stride, + ) + self.model.names = self.config.names + self.yolo: YOLOWrapper = None + self.half = False + + # Initialize weights and apply final processing + self.post_init() + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: str, *model_args, **kwargs): # type: ignore + """Loads a pretrained YOLOv8 model from a local path or the Hugging Face Hub and initializes the wrapper. + + This class method loads the model weights, creates the `YOLOWrapper` instance, and configures + task-specific overrides to enable inference immediately after loading. + + Args: + pretrained_model_name_or_path (str): The name or path of the pretrained model. + model_args: Additional positional arguments passed to parent class. 
+ kwargs: Additional keyword arguments passed to parent class. + + Returns: + YOLOV8ForObjectDetection: The initialized model with loaded weights and active YOLO wrapper. + """ + + dtype = torch.float32 + if "dtype" in kwargs: + dtype = kwargs.pop("dtype") + elif "torch_dtype" in kwargs: + dtype = kwargs.pop("torch_dtype") + + fuse = True + inplace = True + + # set model weights + model = super().from_pretrained( + pretrained_model_name_or_path, + *model_args, + **kwargs, + dtype=torch.float32, # needed for model.fuse() + ) + + # fuse model + for module in model.model.modules(): + module.requires_grad_(False) + model.model = model.model.fuse().eval() if fuse and hasattr(model.model, "fuse") else model.model.eval() + + # module updates + for m in model.model.modules(): + t = type(m) + if t in (nn.Hardswish, nn.LeakyReLU, nn.ReLU, nn.ReLU6, nn.SiLU, Detect, Segment, Pose, OBB): + m.inplace = inplace + elif t is nn.Upsample and not hasattr(m, "recompute_scale_factor"): + m.recompute_scale_factor = None # torch 1.11.0 compatibility + + model.model.fp16 = True if dtype is torch.float16 else False + model.half = True if dtype is torch.float16 else False + + # initialize a wrapper + yolo = YOLOWrapper(model=model.model, task=model.config.task) + yolo.overrides["model"] = pretrained_model_name_or_path + yolo.overrides["task"] = model.config.task + yolo.overrides["half"] = True if dtype is torch.float16 else False + model.yolo = yolo + model.yolo.ckpt = pretrained_model_name_or_path + if dtype is torch.float16: + model.yolo.model = model.yolo.model.half() + + return model + + def forward( + self, + model_input: Optional[Union[Image.Image, np.ndarray, str]] = None, + return_dict: Optional[bool] = None, + conf: float = 0.4, + iou: float = 0.7, + max_det: int = 300, + verbose: bool = False, + **inference_kwargs, + ) -> Union[Tuple, Results]: + """ + Performs a forward pass (inference) on the input data using the wrapped YOLO model. 
 + + This method handles image preprocessing, inference, and post-processing (NMS) based on the provided arguments. + It requires `from_pretrained` to have been called effectively to populate the internal YOLO wrapper. + + Args: + model_input (Image.Image | np.ndarray | str, optional): The input image(s). Accepts file paths, PIL Images, or NumPy arrays. + return_dict (bool, optional): Whether to return a dictionary (or Results object) instead of a tuple. Defaults to model config. + conf (float): Confidence threshold for Non-Maximum Suppression (NMS). Defaults to 0.4. + iou (float): IoU threshold for NMS. Defaults to 0.7. + max_det (int): Maximum number of detections allowed per image. Defaults to 300. + verbose (bool): Whether to print verbose output during inference. Defaults to False. + **inference_kwargs: Additional arguments supported by the Ultralytics predictor (e.g., `imgsz`, `device`). + See all available arguments at https://docs.ultralytics.com/usage/cfg. + Returns: + Union[Tuple, Results]: A tuple containing the `Results` object if `return_dict` is False, otherwise the `Results` object directly. + + Raises: + RuntimeError: If the internal YOLO wrapper is not initialized (e.g., model not loaded via `.from_pretrained()`). + """ + + if self.yolo is None: + raise RuntimeError("Call .from_pretrained(...) 
before forward().") + + # accepted image url, PIL.Image or np.ndarray image + assert isinstance(model_input, (Image.Image, np.ndarray, str)) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + detector_kwargs = {"conf": conf, "iou": iou, "verbose": verbose, "max_det": max_det} + detector_kwargs.update(inference_kwargs) + results: Results = self.yolo.predict(model_input, **detector_kwargs) + + if not return_dict: + return (results,) + + return results diff --git a/yolo-face-person-detector/requirements.txt b/yolo-face-person-detector/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..5c2827f6943a74a0a955e7cfedd00aafb754f424 --- /dev/null +++ b/yolo-face-person-detector/requirements.txt @@ -0,0 +1,4 @@ +ultralytics==8.1.0 +torch==2.5.1 +torchvision==0.20.1 +omegaconf diff --git a/yolo-face-person-detector/yolov8x_person_face.pt b/yolo-face-person-detector/yolov8x_person_face.pt new file mode 100644 index 0000000000000000000000000000000000000000..c4fea0e2a9545fe4db35fc88ad60804e7209da8b --- /dev/null +++ b/yolo-face-person-detector/yolov8x_person_face.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2620f45609a65f909eb876bd7401308b5a8f3843ad5a03cb7416066a3e492989 +size 136716488